Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
a9b0005a7d Bump github.com/fsnotify/fsnotify from 1.9.0 to 1.10.1
Bumps [github.com/fsnotify/fsnotify](https://github.com/fsnotify/fsnotify) from 1.9.0 to 1.10.1.
- [Release notes](https://github.com/fsnotify/fsnotify/releases)
- [Changelog](https://github.com/fsnotify/fsnotify/blob/main/CHANGELOG.md)
- [Commits](https://github.com/fsnotify/fsnotify/compare/v1.9.0...v1.10.1)

---
updated-dependencies:
- dependency-name: github.com/fsnotify/fsnotify
  dependency-version: 1.10.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-06-15 14:45:54 +00:00
361 changed files with 3685 additions and 50102 deletions

View File

@@ -20,7 +20,7 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
@@ -59,12 +59,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Set up Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: true

View File

@@ -15,7 +15,7 @@ jobs:
pull-requests: write
steps:
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- uses: git-town/action@3d8b878379abb1ee393fb49865a28b4a6c2cd3b0 # v1.2.1

View File

@@ -16,12 +16,12 @@ jobs:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -48,7 +48,7 @@ jobs:
run: NETBIRD_STORE_ENGINE=${{ matrix.store }} CI=true go test -coverprofile=coverage.txt -tags=devcert -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE' -timeout 5m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird

View File

@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
@@ -28,7 +28,7 @@ jobs:
id: test
env:
GO_VERSION: ${{ steps.goversion.outputs.version }}
uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
with:
usesh: true
copyback: false

View File

@@ -18,7 +18,7 @@ jobs:
management: ${{ steps.filter.outputs.management }}
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
@@ -30,7 +30,7 @@ jobs:
- 'management/**'
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -119,12 +119,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -162,7 +162,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird
@@ -175,12 +175,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -246,12 +246,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -290,7 +290,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird
@@ -306,12 +306,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -347,7 +347,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird
@@ -363,12 +363,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -407,7 +407,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird
@@ -424,12 +424,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -484,7 +484,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird
@@ -529,12 +529,12 @@ jobs:
prom/prometheus
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -579,11 +579,10 @@ jobs:
CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
NETBIRD_STORE_ENGINE=${{ matrix.store }} \
CI=true \
GIT_BRANCH=${{ github.ref_name }} \
go test -tags devcert -run=^$ -bench=. \
-exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
-timeout 20m ./management/... ./shared/management/... $(go list ./management/... ./shared/management/... | grep -v -e /management/server/http)
env:
GIT_BRANCH: ${{ github.ref_name }}
api_benchmark:
name: "Management / Benchmark (API)"
@@ -624,12 +623,12 @@ jobs:
prom/prometheus
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -674,13 +673,12 @@ jobs:
CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
NETBIRD_STORE_ENGINE=${{ matrix.store }} \
CI=true \
GIT_BRANCH=${{ github.ref_name }} \
go test -tags=benchmark \
-run=^$ \
-bench=. \
-exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
-timeout 20m ./management/server/http/...
env:
GIT_BRANCH: ${{ github.ref_name }}
api_integration_test:
name: "Management / Integration"
@@ -694,12 +692,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -736,7 +734,7 @@ jobs:
- name: Upload coverage reports to Codecov
if: matrix.arch == 'amd64'
uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: netbirdio/netbird

View File

@@ -18,12 +18,12 @@ jobs:
runs-on: windows-latest
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
id: go
with:
go-version-file: "go.mod"

View File

@@ -15,13 +15,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: codespell
uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # v2.2
with:
ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals,flate,recordin,unparseable
ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals
skip: go.mod,go.sum,**/proxy/web/**
golangci:
strategy:
@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 15
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Check for duplicate constants
@@ -48,7 +48,7 @@ jobs:
run: |
! awk '/const \(/,/)/{print $0}' management/server/activity/codes.go | grep -o '= [0-9]*' | sort | uniq -d | grep .
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false

View File

@@ -22,7 +22,7 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

View File

@@ -16,11 +16,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
- name: Setup Android SDK
@@ -28,7 +28,7 @@ jobs:
with:
cmdline-tools-version: 8512546
- name: Setup Java
uses: actions/setup-java@ad2b38190b15e4d6bdf0c97fb4fca8412226d287
uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654
with:
java-version: "11"
distribution: "adopt"
@@ -54,11 +54,11 @@ jobs:
runs-on: macos-latest
steps:
- name: Checkout repository
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
- name: install gomobile

View File

@@ -9,13 +9,10 @@ on:
pull_request:
env:
SIGN_PIPE_VER: "v0.1.6"
GORELEASER_VER: "v2.16.0"
SIGN_PIPE_VER: "v0.1.5"
GORELEASER_VER: "v2.14.3"
PRODUCT_NAME: "NetBird"
COPYRIGHT: "NetBird GmbH"
flags: ""
SKIP_PUBLISH: "true"
SKIP_DOCKER_PUSH: "false"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.actor_id }}
@@ -27,7 +24,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
@@ -64,7 +61,7 @@ jobs:
if: steps.check_diff.outputs.diff_exists == 'true'
env:
GO_VERSION: ${{ steps.goversion.outputs.version }}
uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
with:
usesh: true
copyback: false
@@ -133,9 +130,11 @@ jobs:
windows_packages_artifact_url: ${{ steps.upload_windows_packages.outputs.artifact-url }}
macos_packages_artifact_url: ${{ steps.upload_macos_packages.outputs.artifact-url }}
ghcr_images: ${{ steps.tag_and_push_images.outputs.images_markdown }}
env:
flags: ""
steps:
- name: Checkout
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0 # It is required for GoReleaser to work properly
persist-credentials: false
@@ -144,29 +143,10 @@ jobs:
id: semver_parser
uses: netbirdio/shared-actions/actions/parse-semver@be5df6047383da2236e02243cceb857d8567c27e # v0.0.2
- name: Set snapshot flag
if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
run: |
echo "flags=--snapshot" >> $GITHUB_ENV
- name: Set build vars
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
run: |
if [[ "x-${{ steps.semver_parser.outputs.prerelease }}" == "x-" && "x-${{ github.repository }}" == "x-netbirdio/netbird" ]]; then
echo "x-${{ github.repository }}"
echo "x-${{ steps.semver_parser.outputs.prerelease }}"
echo "SKIP_PUBLISH=false" >> $GITHUB_ENV
else
echo "x-${{ github.repository }}"
echo "x-${{ steps.semver_parser.outputs.prerelease }}"
fi
if [[ "x-${{ github.repository }}" != "x-netbirdio/netbird" ]]; then
echo "SKIP_DOCKER_PUSH=true" >> $GITHUB_ENV
fi
- if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
run: echo "flags=--snapshot" >> $GITHUB_ENV
- name: Set up Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -181,14 +161,12 @@ jobs:
${{ runner.os }}-go-releaser-
- name: Install modules
run: go mod tidy
- name: run openapi generator
run: bash shared/management/http/api/generate.sh
- name: check git status
run: git --no-pager diff --exit-code
- name: Set up QEMU
uses: docker/setup-qemu-action@06116385d9baf250c9f4dcb4858b16962ea869c3 #v4.1.0
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a #v4.0.0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 #v4.1.0
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
- name: Login to Docker hub
if: github.event_name != 'pull_request'
uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0
@@ -221,7 +199,7 @@ jobs:
run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/manifest.xml -product-name ${{ env.PRODUCT_NAME }} -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/resources_windows_arm64.syso
- name: Run GoReleaser
id: goreleaser
uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
with:
version: ${{ env.GORELEASER_VER }}
args: release --clean ${{ env.flags }}
@@ -232,8 +210,6 @@ jobs:
UPLOAD_YUM_SECRET: ${{ secrets.PKG_UPLOAD_SECRET }}
GPG_RPM_KEY_FILE: ${{ env.GPG_RPM_KEY_FILE }}
NFPM_NETBIRD_RPM_PASSPHRASE: ${{ secrets.GPG_RPM_PASSPHRASE }}
SKIP_PUBLISH: ${{ env.SKIP_PUBLISH }}
SKIP_DOCKER_PUSH: ${{ env.SKIP_DOCKER_PUSH }}
- name: Verify RPM signatures
run: |
docker run --rm -v $(pwd)/dist:/dist fedora:41 bash -c '
@@ -347,7 +323,7 @@ jobs:
release_ui_artifact_url: ${{ steps.upload_release_ui.outputs.artifact-url }}
steps:
- name: Checkout
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0 # It is required for GoReleaser to work properly
persist-credentials: false
@@ -356,25 +332,11 @@ jobs:
id: semver_parser
uses: netbirdio/shared-actions/actions/parse-semver@be5df6047383da2236e02243cceb857d8567c27e # v0.0.2
- name: Set snapshot flag
if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
run: |
echo "flags=--snapshot" >> $GITHUB_ENV
- name: Set build vars
if: ${{ startsWith(github.ref, 'refs/tags/v') }}
run: |
if [[ "x-${{ steps.semver_parser.outputs.prerelease }}" == "x-" && "x-${{ github.repository }}" == "x-netbirdio/netbird" ]]; then
echo "x-${{ github.repository }}"
echo "x-${{ steps.semver_parser.outputs.prerelease }}"
echo "SKIP_PUBLISH=false" >> $GITHUB_ENV
else
echo "x-${{ github.repository }}"
echo "x-${{ steps.semver_parser.outputs.prerelease }}"
fi
- if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
run: echo "flags=--snapshot" >> $GITHUB_ENV
- name: Set up Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -420,7 +382,7 @@ jobs:
run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/ui/manifest.xml -product-name ${{ env.PRODUCT_NAME }}-"UI" -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/ui/resources_windows_arm64.syso
- name: Run GoReleaser
uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
with:
version: ${{ env.GORELEASER_VER }}
args: release --config .goreleaser_ui.yaml --clean ${{ env.flags }}
@@ -431,7 +393,6 @@ jobs:
UPLOAD_YUM_SECRET: ${{ secrets.PKG_UPLOAD_SECRET }}
GPG_RPM_KEY_FILE: ${{ env.GPG_RPM_KEY_FILE }}
NFPM_NETBIRD_UI_RPM_PASSPHRASE: ${{ secrets.GPG_RPM_PASSPHRASE }}
SKIP_PUBLISH: ${{ env.SKIP_PUBLISH }}
- name: Verify RPM signatures
run: |
docker run --rm -v $(pwd)/dist:/dist fedora:41 bash -c '
@@ -464,12 +425,12 @@ jobs:
- if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
run: echo "flags=--snapshot" >> $GITHUB_ENV
- name: Checkout
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0 # It is required for GoReleaser to work properly
persist-credentials: false
- name: Set up Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
cache: false
@@ -488,7 +449,7 @@ jobs:
run: git --no-pager diff --exit-code
- name: Run GoReleaser
id: goreleaser
uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
with:
version: ${{ env.GORELEASER_VER }}
args: release --config .goreleaser_ui_darwin.yaml --clean ${{ env.flags }}
@@ -522,7 +483,7 @@ jobs:
downloadPath: '${{ github.workspace }}\temp'
steps:
- name: Checkout
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
@@ -534,13 +495,13 @@ jobs:
run: echo "C:\Program Files\7-Zip" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Download release artifacts
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
with:
name: release
path: release
- name: Download UI release artifacts
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
with:
name: release-ui
path: release-ui

View File

@@ -68,12 +68,12 @@ jobs:
run: sudo apt-get install -y curl
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
@@ -207,7 +207,7 @@ jobs:
- name: Build management docker image
working-directory: management
run: |
docker build -t netbirdio/management:latest --build-arg TARGETPLATFORM=. .
docker build -t netbirdio/management:latest .
- name: Build signal binary
working-directory: signal
@@ -216,7 +216,7 @@ jobs:
- name: Build signal docker image
working-directory: signal
run: |
docker build -t netbirdio/signal:latest --build-arg TARGETPLATFORM=. .
docker build -t netbirdio/signal:latest .
- name: Build relay binary
working-directory: relay
@@ -225,7 +225,7 @@ jobs:
- name: Build relay docker image
working-directory: relay
run: |
docker build -t netbirdio/relay:latest --build-arg TARGETPLATFORM=. .
docker build -t netbirdio/relay:latest .
- name: run docker compose up
working-directory: infrastructure_files/artifacts
@@ -256,7 +256,7 @@ jobs:
run: sudo apt-get install -y jq
- name: Checkout code
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

View File

@@ -19,11 +19,11 @@ jobs:
GOARCH: wasm
steps:
- name: Checkout repository
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
- name: Install dependencies
@@ -44,11 +44,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
- name: Install Go
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version-file: "go.mod"
- name: Build Wasm client

View File

@@ -1,7 +1,5 @@
version: 2
env:
- SKIP_PUBLISH={{ if index .Env "SKIP_PUBLISH" }}{{ .Env.SKIP_PUBLISH }}{{ else }}true{{ end }}
- SKIP_DOCKER_PUSH={{ if index .Env "SKIP_DOCKER_PUSH" }}{{ .Env.SKIP_DOCKER_PUSH }}{{ else }}false{{ end }}
project_name: netbird
builds:
- id: netbird-wasm
@@ -76,8 +74,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -92,8 +88,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -108,8 +102,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -130,8 +122,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -146,8 +136,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -162,8 +150,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X main.Version={{.Version}} -X main.Commit={{.Commit}} -X main.BuildDate={{.CommitDate}}
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -184,8 +170,6 @@ builds:
- amd64
- arm64
- arm
goarm:
- 7
ldflags:
- -s -w -X github.com/netbirdio/netbird/version.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.CommitDate}} -X main.builtBy=goreleaser
mod_timestamp: "{{ .CommitTimestamp }}"
@@ -238,192 +222,670 @@ nfpms:
rpm:
signature:
key_file: '{{ if index .Env "GPG_RPM_KEY_FILE" }}{{ .Env.GPG_RPM_KEY_FILE }}{{ end }}'
dockers_v2:
- id: netbird
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird
images:
- netbirdio/netbird
- ghcr.io/netbirdio/netbird
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: client/Dockerfile
extra_files:
- client/netbird-entrypoint.sh
platforms:
- linux/amd64
- linux/arm64
- linux/arm/6
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: netbird-rootless
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird
images:
- netbirdio/netbird
- ghcr.io/netbirdio/netbird
tags:
- "v{{ .Version }}-rootless"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: client/Dockerfile-rootless
extra_files:
- client/netbird-entrypoint.sh
platforms:
- linux/amd64
- linux/arm64
- linux/arm/6
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: relay
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-relay
images:
- netbirdio/relay
- ghcr.io/netbirdio/relay
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: relay/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: signal
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-signal
images:
- netbirdio/signal
- ghcr.io/netbirdio/signal
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: signal/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: management
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-mgmt
images:
- netbirdio/management
- ghcr.io/netbirdio/management
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: management/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: upload
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-upload
images:
- netbirdio/upload
- ghcr.io/netbirdio/upload
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: upload-server/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: netbird-server
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-server
images:
- netbirdio/netbird-server
- ghcr.io/netbirdio/netbird-server
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: combined/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
- id: netbird-proxy
disable: "{{ .Env.SKIP_DOCKER_PUSH }}"
ids:
- netbird-proxy
images:
- netbirdio/reverse-proxy
- ghcr.io/netbirdio/reverse-proxy
tags:
- "{{ .Version }}"
- "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
dockerfile: proxy/Dockerfile
platforms:
- linux/amd64
- linux/arm64
- linux/arm
annotations:
"org.opencontainers.image.created": "{{.Date}}"
"org.opencontainers.image.title": "{{.ProjectName}}"
"org.opencontainers.image.version": "{{.Version}}"
"org.opencontainers.image.revision": "{{.FullCommit}}"
"org.opencontainers.image.source": "{{.GitURL}}"
"maintainer": "dev@netbird.io"
dockers:
- image_templates:
- netbirdio/netbird:{{ .Version }}-amd64
- ghcr.io/netbirdio/netbird:{{ .Version }}-amd64
ids:
- netbird
goarch: amd64
use: buildx
dockerfile: client/Dockerfile
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm64v8
ids:
- netbird
goarch: arm64
use: buildx
dockerfile: client/Dockerfile
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm
ids:
- netbird
goarch: arm
goarm: 6
use: buildx
dockerfile: client/Dockerfile
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird:{{ .Version }}-rootless-amd64
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-amd64
ids:
- netbird
goarch: amd64
use: buildx
dockerfile: client/Dockerfile-rootless
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird:{{ .Version }}-rootless-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm64v8
ids:
- netbird
goarch: arm64
use: buildx
dockerfile: client/Dockerfile-rootless
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird:{{ .Version }}-rootless-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm
ids:
- netbird
goarch: arm
goarm: 6
use: buildx
dockerfile: client/Dockerfile-rootless
extra_files:
- client/netbird-entrypoint.sh
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/relay:{{ .Version }}-amd64
- ghcr.io/netbirdio/relay:{{ .Version }}-amd64
ids:
- netbird-relay
goarch: amd64
use: buildx
dockerfile: relay/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/relay:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/relay:{{ .Version }}-arm64v8
ids:
- netbird-relay
goarch: arm64
use: buildx
dockerfile: relay/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/relay:{{ .Version }}-arm
- ghcr.io/netbirdio/relay:{{ .Version }}-arm
ids:
- netbird-relay
goarch: arm
goarm: 6
use: buildx
dockerfile: relay/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/signal:{{ .Version }}-amd64
- ghcr.io/netbirdio/signal:{{ .Version }}-amd64
ids:
- netbird-signal
goarch: amd64
use: buildx
dockerfile: signal/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/signal:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/signal:{{ .Version }}-arm64v8
ids:
- netbird-signal
goarch: arm64
use: buildx
dockerfile: signal/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/signal:{{ .Version }}-arm
- ghcr.io/netbirdio/signal:{{ .Version }}-arm
ids:
- netbird-signal
goarch: arm
goarm: 6
use: buildx
dockerfile: signal/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-amd64
- ghcr.io/netbirdio/management:{{ .Version }}-amd64
ids:
- netbird-mgmt
goarch: amd64
use: buildx
dockerfile: management/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/management:{{ .Version }}-arm64v8
ids:
- netbird-mgmt
goarch: arm64
use: buildx
dockerfile: management/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-arm
- ghcr.io/netbirdio/management:{{ .Version }}-arm
ids:
- netbird-mgmt
goarch: arm
goarm: 6
use: buildx
dockerfile: management/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-debug-amd64
- ghcr.io/netbirdio/management:{{ .Version }}-debug-amd64
ids:
- netbird-mgmt
goarch: amd64
use: buildx
dockerfile: management/Dockerfile.debug
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-debug-arm64v8
- ghcr.io/netbirdio/management:{{ .Version }}-debug-arm64v8
ids:
- netbird-mgmt
goarch: arm64
use: buildx
dockerfile: management/Dockerfile.debug
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/management:{{ .Version }}-debug-arm
- ghcr.io/netbirdio/management:{{ .Version }}-debug-arm
ids:
- netbird-mgmt
goarch: arm
goarm: 6
use: buildx
dockerfile: management/Dockerfile.debug
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/upload:{{ .Version }}-amd64
- ghcr.io/netbirdio/upload:{{ .Version }}-amd64
ids:
- netbird-upload
goarch: amd64
use: buildx
dockerfile: upload-server/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/upload:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/upload:{{ .Version }}-arm64v8
ids:
- netbird-upload
goarch: arm64
use: buildx
dockerfile: upload-server/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/upload:{{ .Version }}-arm
- ghcr.io/netbirdio/upload:{{ .Version }}-arm
ids:
- netbird-upload
goarch: arm
goarm: 6
use: buildx
dockerfile: upload-server/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird-server:{{ .Version }}-amd64
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-amd64
ids:
- netbird-server
goarch: amd64
use: buildx
dockerfile: combined/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird-server:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm64v8
ids:
- netbird-server
goarch: arm64
use: buildx
dockerfile: combined/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/netbird-server:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm
ids:
- netbird-server
goarch: arm
goarm: 6
use: buildx
dockerfile: combined/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/reverse-proxy:{{ .Version }}-amd64
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-amd64
ids:
- netbird-proxy
goarch: amd64
use: buildx
dockerfile: proxy/Dockerfile
build_flag_templates:
- "--platform=linux/amd64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/reverse-proxy:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm64v8
ids:
- netbird-proxy
goarch: arm64
use: buildx
dockerfile: proxy/Dockerfile
build_flag_templates:
- "--platform=linux/arm64"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
- image_templates:
- netbirdio/reverse-proxy:{{ .Version }}-arm
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm
ids:
- netbird-proxy
goarch: arm
goarm: 6
use: buildx
dockerfile: proxy/Dockerfile
build_flag_templates:
- "--platform=linux/arm"
- "--label=org.opencontainers.image.created={{.Date}}"
- "--label=org.opencontainers.image.title={{.ProjectName}}"
- "--label=org.opencontainers.image.version={{.Version}}"
- "--label=org.opencontainers.image.revision={{.FullCommit}}"
- "--label=org.opencontainers.image.source=https://github.com/netbirdio/{{.ProjectName}}"
- "--label=maintainer=dev@netbird.io"
docker_manifests:
- name_template: netbirdio/netbird:{{ .Version }}
image_templates:
- netbirdio/netbird:{{ .Version }}-arm64v8
- netbirdio/netbird:{{ .Version }}-arm
- netbirdio/netbird:{{ .Version }}-amd64
- name_template: netbirdio/netbird:latest
image_templates:
- netbirdio/netbird:{{ .Version }}-arm64v8
- netbirdio/netbird:{{ .Version }}-arm
- netbirdio/netbird:{{ .Version }}-amd64
- name_template: netbirdio/netbird:{{ .Version }}-rootless
image_templates:
- netbirdio/netbird:{{ .Version }}-rootless-arm64v8
- netbirdio/netbird:{{ .Version }}-rootless-arm
- netbirdio/netbird:{{ .Version }}-rootless-amd64
- name_template: netbirdio/netbird:rootless-latest
image_templates:
- netbirdio/netbird:{{ .Version }}-rootless-arm64v8
- netbirdio/netbird:{{ .Version }}-rootless-arm
- netbirdio/netbird:{{ .Version }}-rootless-amd64
- name_template: netbirdio/relay:{{ .Version }}
image_templates:
- netbirdio/relay:{{ .Version }}-arm64v8
- netbirdio/relay:{{ .Version }}-arm
- netbirdio/relay:{{ .Version }}-amd64
- name_template: netbirdio/relay:latest
image_templates:
- netbirdio/relay:{{ .Version }}-arm64v8
- netbirdio/relay:{{ .Version }}-arm
- netbirdio/relay:{{ .Version }}-amd64
- name_template: netbirdio/signal:{{ .Version }}
image_templates:
- netbirdio/signal:{{ .Version }}-arm64v8
- netbirdio/signal:{{ .Version }}-arm
- netbirdio/signal:{{ .Version }}-amd64
- name_template: netbirdio/signal:latest
image_templates:
- netbirdio/signal:{{ .Version }}-arm64v8
- netbirdio/signal:{{ .Version }}-arm
- netbirdio/signal:{{ .Version }}-amd64
- name_template: netbirdio/management:{{ .Version }}
image_templates:
- netbirdio/management:{{ .Version }}-arm64v8
- netbirdio/management:{{ .Version }}-arm
- netbirdio/management:{{ .Version }}-amd64
- name_template: netbirdio/management:latest
image_templates:
- netbirdio/management:{{ .Version }}-arm64v8
- netbirdio/management:{{ .Version }}-arm
- netbirdio/management:{{ .Version }}-amd64
- name_template: netbirdio/management:debug-latest
image_templates:
- netbirdio/management:{{ .Version }}-debug-arm64v8
- netbirdio/management:{{ .Version }}-debug-arm
- netbirdio/management:{{ .Version }}-debug-amd64
- name_template: netbirdio/upload:{{ .Version }}
image_templates:
- netbirdio/upload:{{ .Version }}-arm64v8
- netbirdio/upload:{{ .Version }}-arm
- netbirdio/upload:{{ .Version }}-amd64
- name_template: netbirdio/upload:latest
image_templates:
- netbirdio/upload:{{ .Version }}-arm64v8
- netbirdio/upload:{{ .Version }}-arm
- netbirdio/upload:{{ .Version }}-amd64
- name_template: netbirdio/netbird-server:{{ .Version }}
image_templates:
- netbirdio/netbird-server:{{ .Version }}-arm64v8
- netbirdio/netbird-server:{{ .Version }}-arm
- netbirdio/netbird-server:{{ .Version }}-amd64
- name_template: netbirdio/netbird-server:latest
image_templates:
- netbirdio/netbird-server:{{ .Version }}-arm64v8
- netbirdio/netbird-server:{{ .Version }}-arm
- netbirdio/netbird-server:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/netbird:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/netbird:latest
image_templates:
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/netbird:{{ .Version }}-rootless
image_templates:
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-amd64
- name_template: ghcr.io/netbirdio/netbird:rootless-latest
image_templates:
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm64v8
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-arm
- ghcr.io/netbirdio/netbird:{{ .Version }}-rootless-amd64
- name_template: ghcr.io/netbirdio/relay:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/relay:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/relay:{{ .Version }}-arm
- ghcr.io/netbirdio/relay:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/relay:latest
image_templates:
- ghcr.io/netbirdio/relay:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/relay:{{ .Version }}-arm
- ghcr.io/netbirdio/relay:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/signal:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/signal:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/signal:{{ .Version }}-arm
- ghcr.io/netbirdio/signal:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/signal:latest
image_templates:
- ghcr.io/netbirdio/signal:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/signal:{{ .Version }}-arm
- ghcr.io/netbirdio/signal:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/management:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/management:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/management:{{ .Version }}-arm
- ghcr.io/netbirdio/management:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/management:latest
image_templates:
- ghcr.io/netbirdio/management:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/management:{{ .Version }}-arm
- ghcr.io/netbirdio/management:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/management:debug-latest
image_templates:
- ghcr.io/netbirdio/management:{{ .Version }}-debug-arm64v8
- ghcr.io/netbirdio/management:{{ .Version }}-debug-arm
- ghcr.io/netbirdio/management:{{ .Version }}-debug-amd64
- name_template: ghcr.io/netbirdio/upload:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/upload:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/upload:{{ .Version }}-arm
- ghcr.io/netbirdio/upload:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/upload:latest
image_templates:
- ghcr.io/netbirdio/upload:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/upload:{{ .Version }}-arm
- ghcr.io/netbirdio/upload:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/netbird-server:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/netbird-server:latest
image_templates:
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-arm
- ghcr.io/netbirdio/netbird-server:{{ .Version }}-amd64
- name_template: netbirdio/reverse-proxy:{{ .Version }}
image_templates:
- netbirdio/reverse-proxy:{{ .Version }}-arm64v8
- netbirdio/reverse-proxy:{{ .Version }}-arm
- netbirdio/reverse-proxy:{{ .Version }}-amd64
- name_template: netbirdio/reverse-proxy:latest
image_templates:
- netbirdio/reverse-proxy:{{ .Version }}-arm64v8
- netbirdio/reverse-proxy:{{ .Version }}-arm
- netbirdio/reverse-proxy:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/reverse-proxy:{{ .Version }}
image_templates:
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-amd64
- name_template: ghcr.io/netbirdio/reverse-proxy:latest
image_templates:
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm64v8
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-arm
- ghcr.io/netbirdio/reverse-proxy:{{ .Version }}-amd64
brews:
- ids:
- default
skip_upload: "{{ .Env.SKIP_PUBLISH }}"
repository:
owner: netbirdio
name: homebrew-tap
@@ -440,7 +902,6 @@ brews:
uploads:
- name: debian
skip: "{{ .Env.SKIP_PUBLISH }}"
ids:
- netbird_deb
mode: archive
@@ -449,7 +910,6 @@ uploads:
method: PUT
- name: yum
skip: "{{ .Env.SKIP_PUBLISH }}"
ids:
- netbird_rpm
mode: archive
@@ -462,13 +922,9 @@ checksum:
- glob: ./infrastructure_files/getting-started-with-zitadel.sh
- glob: ./release_files/install.sh
- glob: ./infrastructure_files/getting-started.sh
- glob: ./infrastructure_files/getting-started-enterprise.sh
- glob: ./infrastructure_files/migrate-to-enterprise.sh
release:
extra_files:
- glob: ./infrastructure_files/getting-started-with-zitadel.sh
- glob: ./release_files/install.sh
- glob: ./infrastructure_files/getting-started.sh
- glob: ./infrastructure_files/getting-started-enterprise.sh
- glob: ./infrastructure_files/migrate-to-enterprise.sh

View File

@@ -1,6 +1,5 @@
version: 2
env:
- SKIP_PUBLISH={{ if index .Env "SKIP_PUBLISH" }}{{ .Env.SKIP_PUBLISH }}{{ else }}true{{ end }}
project_name: netbird-ui
builds:
- id: netbird-ui
@@ -102,7 +101,6 @@ nfpms:
uploads:
- name: debian
skip: "{{ .Env.SKIP_PUBLISH }}"
ids:
- netbird_ui_deb
mode: archive
@@ -111,7 +109,6 @@ uploads:
method: PUT
- name: yum
skip: "{{ .Env.SKIP_PUBLISH }}"
ids:
- netbird_ui_rpm
mode: archive

View File

@@ -4,7 +4,7 @@
# sudo podman build -t localhost/netbird:latest -f client/Dockerfile --ignorefile .dockerignore-client .
# sudo podman run --rm -it --cap-add={BPF,NET_ADMIN,NET_RAW} localhost/netbird:latest
FROM alpine:3.24
FROM alpine:3.23.3
# iproute2: busybox doesn't display ip rules properly
RUN apk add --no-cache \
bash \
@@ -21,7 +21,7 @@ ENV \
NB_ENTRYPOINT_SERVICE_TIMEOUT="30"
ENTRYPOINT [ "/usr/local/bin/netbird-entrypoint.sh" ]
ARG TARGETPLATFORM
ARG NETBIRD_BINARY=$TARGETPLATFORM/netbird
ARG NETBIRD_BINARY=netbird
COPY client/netbird-entrypoint.sh /usr/local/bin/netbird-entrypoint.sh
COPY "${NETBIRD_BINARY}" /usr/local/bin/netbird

View File

@@ -4,7 +4,7 @@
# podman build -t localhost/netbird:latest -f client/Dockerfile --ignorefile .dockerignore-client .
# podman run --rm -it --cap-add={BPF,NET_ADMIN,NET_RAW} localhost/netbird:latest
FROM alpine:3.24
FROM alpine:3.22.0
RUN apk add --no-cache \
bash \
@@ -27,7 +27,7 @@ ENV \
NB_ENTRYPOINT_SERVICE_TIMEOUT="30"
ENTRYPOINT [ "/usr/local/bin/netbird-entrypoint.sh" ]
ARG TARGETPLATFORM
ARG NETBIRD_BINARY=$TARGETPLATFORM/netbird
ARG NETBIRD_BINARY=netbird
COPY client/netbird-entrypoint.sh /usr/local/bin/netbird-entrypoint.sh
COPY "${NETBIRD_BINARY}" /usr/local/bin/netbird

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
log "github.com/sirupsen/logrus"
@@ -23,7 +24,6 @@ const (
// Profile represents a profile for gomobile
type Profile struct {
ID string
Name string
IsActive bool
}
@@ -53,10 +53,10 @@ func (p *ProfileArray) Get(i int) *Profile {
├── state.json ← Default profile state
├── active_profile.json ← Active profile tracker (JSON with Name + Username)
└── profiles/ ← Subdirectory for non-default profiles
├── work.json ← Legacy work profile config
├── work.state.json ← Legacy work profile state
├── 4c5f5c8198c3989cffb5b5394f5a7ae0.json ← ID profile config
── 4c5f5c8198c3989cffb5b5394f5a7ae0.state.json ← ID profile state
├── work.json ← Work profile config
├── work.state.json ← Work profile state
├── personal.json ← Personal profile config
── personal.state.json ← Personal profile state
*/
// ProfileManager manages profiles for Android
@@ -99,7 +99,6 @@ func (pm *ProfileManager) ListProfiles() (*ProfileArray, error) {
var profiles []*Profile
for _, p := range internalProfiles {
profiles = append(profiles, &Profile{
ID: p.ID.String(),
Name: p.Name,
IsActive: p.IsActive,
})
@@ -109,65 +108,55 @@ func (pm *ProfileManager) ListProfiles() (*ProfileArray, error) {
}
// GetActiveProfile returns the currently active profile name
func (pm *ProfileManager) GetActiveProfile() (*Profile, error) {
func (pm *ProfileManager) GetActiveProfile() (string, error) {
// Use ServiceManager to stay consistent with ListProfiles
// ServiceManager uses active_profile.json
activeState, err := pm.serviceMgr.GetActiveProfileState()
if err != nil {
return nil, fmt.Errorf("failed to get active profile: %w", err)
return "", fmt.Errorf("failed to get active profile: %w", err)
}
// ActiveProfileState only stores the ID (and username), not the display
// name. Resolve the ID to the full profile so callers get the real Name.
prof, err := pm.serviceMgr.ResolveProfile(activeState.ID.String(), androidUsername)
if err != nil {
return nil, fmt.Errorf("failed to resolve active profile %q: %w", activeState.ID, err)
}
return &Profile{ID: prof.ID.String(), Name: prof.Name, IsActive: true}, nil
return activeState.Name, nil
}
// SwitchProfile switches to a different profile
func (pm *ProfileManager) SwitchProfile(id string) error {
func (pm *ProfileManager) SwitchProfile(profileName string) error {
// Use ServiceManager to stay consistent with ListProfiles
// ServiceManager uses active_profile.json
err := pm.serviceMgr.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: profilemanager.ID(id),
Name: profileName,
Username: androidUsername,
})
if err != nil {
return fmt.Errorf("failed to switch profile: %w", err)
}
log.Infof("switched to profile: %s", id)
log.Infof("switched to profile: %s", profileName)
return nil
}
// AddProfile creates a new profile
func (pm *ProfileManager) AddProfile(profileName string) error {
// Use ServiceManager (creates profile in profiles/ directory)
profile, err := pm.serviceMgr.AddProfile(profileName, androidUsername)
if err != nil {
if err := pm.serviceMgr.AddProfile(profileName, androidUsername); err != nil {
return fmt.Errorf("failed to add profile: %w", err)
}
log.Infof("created new profile: %s", profile.ID)
log.Infof("created new profile: %s", profileName)
return nil
}
// LogoutProfile logs out from a profile (clears authentication)
func (pm *ProfileManager) LogoutProfile(id string) error {
configPath, err := pm.getProfileConfigPath(id)
func (pm *ProfileManager) LogoutProfile(profileName string) error {
profileName = sanitizeProfileName(profileName)
configPath, err := pm.getProfileConfigPath(profileName)
if err != nil {
return err
}
if !profilemanager.IsValidProfileFilenameStem(profilemanager.ID(id)) {
return fmt.Errorf("id '%s' is not valid", id)
}
// Check if profile exists
if _, err := os.Stat(configPath); os.IsNotExist(err) {
return fmt.Errorf("profile '%s' does not exist", id)
return fmt.Errorf("profile '%s' does not exist", profileName)
}
// Read current config using internal profilemanager
@@ -185,57 +174,53 @@ func (pm *ProfileManager) LogoutProfile(id string) error {
return fmt.Errorf("failed to save config: %w", err)
}
log.Infof("logged out from profile: %s", id)
log.Infof("logged out from profile: %s", profileName)
return nil
}
// RemoveProfile deletes a profile
func (pm *ProfileManager) RemoveProfile(id string) error {
func (pm *ProfileManager) RemoveProfile(profileName string) error {
// Use ServiceManager (removes profile from profiles/ directory)
if err := pm.serviceMgr.RemoveProfile(profilemanager.ID(id), androidUsername); err != nil {
if err := pm.serviceMgr.RemoveProfile(profileName, androidUsername); err != nil {
return fmt.Errorf("failed to remove profile: %w", err)
}
log.Infof("removed profile: %s", id)
log.Infof("removed profile: %s", profileName)
return nil
}
// getProfileConfigPath returns the config file path for a profile
// This is needed for Android-specific path handling (netbird.cfg for default profile)
func (pm *ProfileManager) getProfileConfigPath(id string) (string, error) {
if !profilemanager.IsValidProfileFilenameStem(profilemanager.ID(id)) {
return "", fmt.Errorf("id %q is not valid", id)
}
if id == profilemanager.DefaultProfileName {
func (pm *ProfileManager) getProfileConfigPath(profileName string) (string, error) {
if profileName == "" || profileName == profilemanager.DefaultProfileName {
// Android uses netbird.cfg for default profile instead of default.json
// Default profile is stored in root configDir, not in profiles/
return filepath.Join(pm.configDir, defaultConfigFilename), nil
}
// Non-default profiles are stored in profiles subdirectory
// This matches the Java Preferences.java expectation
profileName = sanitizeProfileName(profileName)
profilesDir := filepath.Join(pm.configDir, profilesSubdir)
return filepath.Join(profilesDir, id+".json"), nil
return filepath.Join(profilesDir, profileName+".json"), nil
}
// GetConfigPath returns the config file path for a given profile id
// GetConfigPath returns the config file path for a given profile
// Java should call this instead of constructing paths with Preferences.configFile()
func (pm *ProfileManager) GetConfigPath(id string) (string, error) {
return pm.getProfileConfigPath(id)
func (pm *ProfileManager) GetConfigPath(profileName string) (string, error) {
return pm.getProfileConfigPath(profileName)
}
// GetStateFilePath returns the state file path for a given profile
// Java should call this instead of constructing paths with Preferences.stateFile()
func (pm *ProfileManager) GetStateFilePath(id string) (string, error) {
if id == "" || id == profilemanager.DefaultProfileName {
func (pm *ProfileManager) GetStateFilePath(profileName string) (string, error) {
if profileName == "" || profileName == profilemanager.DefaultProfileName {
return filepath.Join(pm.configDir, "state.json"), nil
}
if !profilemanager.IsValidProfileFilenameStem(profilemanager.ID(id)) {
return "", fmt.Errorf("id %q is not valid", id)
}
profileName = sanitizeProfileName(profileName)
profilesDir := filepath.Join(pm.configDir, profilesSubdir)
return filepath.Join(profilesDir, id+".state.json"), nil
return filepath.Join(profilesDir, profileName+".state.json"), nil
}
// GetActiveConfigPath returns the config file path for the currently active profile
@@ -245,7 +230,7 @@ func (pm *ProfileManager) GetActiveConfigPath() (string, error) {
if err != nil {
return "", fmt.Errorf("failed to get active profile: %w", err)
}
return pm.GetConfigPath(activeProfile.ID)
return pm.GetConfigPath(activeProfile)
}
// GetActiveStateFilePath returns the state file path for the currently active profile
@@ -255,5 +240,18 @@ func (pm *ProfileManager) GetActiveStateFilePath() (string, error) {
if err != nil {
return "", fmt.Errorf("failed to get active profile: %w", err)
}
return pm.GetStateFilePath(activeProfile.ID)
return pm.GetStateFilePath(activeProfile)
}
// sanitizeProfileName removes invalid characters from profile name
func sanitizeProfileName(name string) string {
// Keep only alphanumeric, underscore, and hyphen
var result strings.Builder
for _, r := range name {
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') ||
(r >= '0' && r <= '9') || r == '_' || r == '-' {
result.WriteRune(r)
}
}
return result.String()
}

View File

@@ -130,7 +130,7 @@ func debugConfigDump(cmd *cobra.Command, _ []string) error {
client := proto.NewDaemonServiceClient(conn)
resp, err := client.GetConfig(cmd.Context(), &proto.GetConfigRequest{
ProfileName: string(activeProf.ID),
ProfileName: activeProf.Name,
Username: currUser.Username,
})
if err != nil {

View File

@@ -96,19 +96,17 @@ func doDaemonLogin(ctx context.Context, cmd *cobra.Command, providedSetupKey str
dnsLabelsReq = dnsLabelsValidated.ToSafeStringList()
}
handle := activeProf.ID.String()
loginRequest := proto.LoginRequest{
SetupKey: providedSetupKey,
ManagementUrl: managementURL,
IsUnixDesktopClient: isUnixRunningDesktop(),
Hostname: hostName,
DnsLabels: dnsLabelsReq,
ProfileName: &handle,
ProfileName: &activeProf.Name,
Username: &username,
}
profileState, err := pm.GetProfileState(activeProf.ID)
profileState, err := pm.GetProfileState(activeProf.Name)
if err != nil {
log.Debugf("failed to get profile state for login hint: %v", err)
} else if profileState.Email != "" {
@@ -172,13 +170,14 @@ func getActiveProfile(ctx context.Context, pm *profilemanager.ProfileManager, pr
return activeProf, nil
}
func switchProfileOnDaemon(ctx context.Context, pm *profilemanager.ProfileManager, handle string, username string) error {
resolvedID, err := switchProfile(ctx, handle, username)
func switchProfileOnDaemon(ctx context.Context, pm *profilemanager.ProfileManager, profileName string, username string) error {
err := switchProfile(context.Background(), profileName, username)
if err != nil {
return fmt.Errorf("switch profile on daemon: %v", err)
}
if err := pm.SwitchProfile(resolvedID); err != nil {
err = pm.SwitchProfile(profileName)
if err != nil {
return fmt.Errorf("switch profile: %v", err)
}
@@ -206,15 +205,11 @@ func switchProfileOnDaemon(ctx context.Context, pm *profilemanager.ProfileManage
return nil
}
// switchProfile asks the daemon to switch to the profile identified by
// handle (a name, ID, or unique ID prefix). Returns the resolved profile
// ID so the caller can update the local active-profile state without
// re-resolving the handle.
func switchProfile(ctx context.Context, handle string, username string) (profilemanager.ID, error) {
func switchProfile(ctx context.Context, profileName string, username string) error {
conn, err := DialClientGRPCServer(ctx, daemonAddr)
if err != nil {
//nolint
return "", fmt.Errorf("failed to connect to daemon error: %v\n"+
return fmt.Errorf("failed to connect to daemon error: %v\n"+
"If the daemon is not running please run: "+
"\nnetbird service install \nnetbird service start\n", err)
}
@@ -222,15 +217,15 @@ func switchProfile(ctx context.Context, handle string, username string) (profile
client := proto.NewDaemonServiceClient(conn)
resp, err := client.SwitchProfile(ctx, &proto.SwitchProfileRequest{
ProfileName: &handle,
_, err = client.SwitchProfile(ctx, &proto.SwitchProfileRequest{
ProfileName: &profileName,
Username: &username,
})
if err != nil {
return "", fmt.Errorf("switch profile failed: %w", err)
return fmt.Errorf("switch profile failed: %v", err)
}
return profilemanager.ID(resp.Id), nil
return nil
}
func doForegroundLogin(ctx context.Context, cmd *cobra.Command, setupKey string, activeProf *profilemanager.Profile) error {
@@ -254,7 +249,7 @@ func doForegroundLogin(ctx context.Context, cmd *cobra.Command, setupKey string,
return fmt.Errorf("read config file %s: %v", configFilePath, err)
}
err = foregroundLogin(ctx, cmd, config, setupKey, activeProf.ID)
err = foregroundLogin(ctx, cmd, config, setupKey, activeProf.Name)
if err != nil {
return fmt.Errorf("foreground login failed: %v", err)
}
@@ -282,7 +277,7 @@ func handleSSOLogin(ctx context.Context, cmd *cobra.Command, loginResp *proto.Lo
return nil
}
func foregroundLogin(ctx context.Context, cmd *cobra.Command, config *profilemanager.Config, setupKey string, profileID profilemanager.ID) error {
func foregroundLogin(ctx context.Context, cmd *cobra.Command, config *profilemanager.Config, setupKey, profileName string) error {
authClient, err := auth.NewAuth(ctx, config.PrivateKey, config.ManagementURL, config)
if err != nil {
return fmt.Errorf("failed to create auth client: %v", err)
@@ -296,7 +291,7 @@ func foregroundLogin(ctx context.Context, cmd *cobra.Command, config *profileman
jwtToken := ""
if setupKey == "" && needsLogin {
tokenInfo, err := foregroundGetTokenInfo(ctx, cmd, config, profileID)
tokenInfo, err := foregroundGetTokenInfo(ctx, cmd, config, profileName)
if err != nil {
return fmt.Errorf("interactive sso login failed: %v", err)
}
@@ -311,10 +306,10 @@ func foregroundLogin(ctx context.Context, cmd *cobra.Command, config *profileman
return nil
}
func foregroundGetTokenInfo(ctx context.Context, cmd *cobra.Command, config *profilemanager.Config, profileID profilemanager.ID) (*auth.TokenInfo, error) {
func foregroundGetTokenInfo(ctx context.Context, cmd *cobra.Command, config *profilemanager.Config, profileName string) (*auth.TokenInfo, error) {
hint := ""
pm := profilemanager.NewProfileManager()
profileState, err := pm.GetProfileState(profileID)
profileState, err := pm.GetProfileState(profileName)
if err != nil {
log.Debugf("failed to get profile state for login hint: %v", err)
} else if profileState.Email != "" {

View File

@@ -27,7 +27,7 @@ func TestLogin(t *testing.T) {
profilemanager.ActiveProfileStatePath = tempDir + "/active_profile.json"
sm := profilemanager.ServiceManager{}
err = sm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: "default",
Name: "default",
Username: currUser.Username,
})
if err != nil {

View File

@@ -2,16 +2,11 @@ package cmd
import (
"context"
"errors"
"fmt"
"os/user"
"strings"
"text/tabwriter"
"time"
"github.com/spf13/cobra"
"google.golang.org/grpc/codes"
gstatus "google.golang.org/grpc/status"
"github.com/netbirdio/netbird/client/internal"
"github.com/netbirdio/netbird/client/internal/profilemanager"
@@ -19,8 +14,6 @@ import (
"github.com/netbirdio/netbird/util"
)
var profileListShowID bool
var profileCmd = &cobra.Command{
Use: "profile",
Short: "Manage NetBird client profiles",
@@ -38,40 +31,27 @@ var profileListCmd = &cobra.Command{
var profileAddCmd = &cobra.Command{
Use: "add <profile_name>",
Short: "Add a new profile",
Long: `Add a new profile. Profile name is free-form, a unique ID is generated for the on-disk config file.`,
Long: `Add a new profile to the NetBird client. The profile name must be unique.`,
Args: cobra.ExactArgs(1),
RunE: addProfileFunc,
}
var profileRenameCmd = &cobra.Command{
Use: "rename <profile> <new_profile_name>",
Short: "Renames an existing profile",
Long: `Renames an existing profile (by a name, ID, or unique ID prefix). Profile name is free-form.`,
Args: cobra.ExactArgs(2),
RunE: renameProfileFunc,
}
var profileRemoveCmd = &cobra.Command{
Use: "remove <profile>",
Short: "Remove a profile",
Long: `Remove a profile by name, ID, or unique ID prefix.`,
Aliases: []string{"rm"},
Args: cobra.ExactArgs(1),
RunE: removeProfileFunc,
Use: "remove <profile_name>",
Short: "Remove a profile",
Long: `Remove a profile from the NetBird client. The profile must not be inactive.`,
Args: cobra.ExactArgs(1),
RunE: removeProfileFunc,
}
var profileSelectCmd = &cobra.Command{
Use: "select <profile>",
Use: "select <profile_name>",
Short: "Select a profile",
Long: `Make the specified profile active. Accepts a name, ID, or unique ID prefix.`,
Long: `Make the specified profile active. This will switch the client to use the selected profile's configuration.`,
Args: cobra.ExactArgs(1),
RunE: selectProfileFunc,
}
func init() {
profileListCmd.Flags().BoolVar(&profileListShowID, "show-id", false, "show the profile ID column")
}
func setupCmd(cmd *cobra.Command) error {
SetFlagsFromEnvVars(rootCmd)
SetFlagsFromEnvVars(cmd)
@@ -85,7 +65,6 @@ func setupCmd(cmd *cobra.Command) error {
return nil
}
func listProfilesFunc(cmd *cobra.Command, _ []string) error {
if err := setupCmd(cmd); err != nil {
return err
@@ -104,33 +83,25 @@ func listProfilesFunc(cmd *cobra.Command, _ []string) error {
daemonClient := proto.NewDaemonServiceClient(conn)
resp, err := daemonClient.ListProfiles(cmd.Context(), &proto.ListProfilesRequest{
profiles, err := daemonClient.ListProfiles(cmd.Context(), &proto.ListProfilesRequest{
Username: currUser.Username,
})
if err != nil {
return err
}
tw := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 2, ' ', 0)
if profileListShowID {
fmt.Fprintln(tw, "ID\tNAME\tACTIVE")
} else {
fmt.Fprintln(tw, "NAME\tACTIVE")
}
for _, profile := range resp.Profiles {
marker := ""
// list profiles, add a tick if the profile is active
cmd.Println("Found", len(profiles.Profiles), "profiles:")
for _, profile := range profiles.Profiles {
// use a cross to indicate the passive profiles
activeMarker := "✗"
if profile.IsActive {
marker = "✓"
}
name := profilemanager.StripCtrlChars(profile.Name)
id := profilemanager.ID(profile.Id)
if profileListShowID {
fmt.Fprintf(tw, "%s\t%s\t%s\n", id.ShortID(), name, marker)
} else {
fmt.Fprintf(tw, "%s\t%s\n", name, marker)
activeMarker = "✓"
}
cmd.Println(activeMarker, profile.Name)
}
return tw.Flush()
return nil
}
func addProfileFunc(cmd *cobra.Command, args []string) error {
@@ -138,90 +109,33 @@ func addProfileFunc(cmd *cobra.Command, args []string) error {
return err
}
currUser, err := user.Current()
if err != nil {
return fmt.Errorf("get current user: %w", err)
}
conn, err := DialClientGRPCServer(cmd.Context(), daemonAddr)
if err != nil {
return fmt.Errorf("connect to service CLI interface: %w", err)
}
defer conn.Close()
currUser, err := user.Current()
if err != nil {
return fmt.Errorf("get current user: %w", err)
}
daemonClient := proto.NewDaemonServiceClient(conn)
profileName := args[0]
id, err := addProfileOnDaemon(cmd.Context(), daemonClient, profileName, currUser.Username)
if err != nil {
return err
}
dupCount, _ := countProfilesWithName(cmd.Context(), daemonClient, currUser.Username, profileName)
if dupCount > 1 {
cmd.Printf("Warning: %d other profile(s) already use the name %q.\n", dupCount-1, profileName)
cmd.Println("Use `netbird profile list --show-id` to disambiguate later.")
}
cmd.Printf("Profile added: %s %s\n", id.ShortID(), profilemanager.StripCtrlChars(profileName))
return nil
}
func renameProfileFunc(cmd *cobra.Command, args []string) error {
if err := setupCmd(cmd); err != nil {
return err
}
conn, err := DialClientGRPCServer(cmd.Context(), daemonAddr)
if err != nil {
return fmt.Errorf("connect to service CLI interface: %w", err)
}
defer conn.Close()
currUser, err := user.Current()
if err != nil {
return fmt.Errorf("get current user: %w", err)
}
daemonClient := proto.NewDaemonServiceClient(conn)
handle := args[0]
newProfilename := args[1]
resp, err := daemonClient.RenameProfile(cmd.Context(), &proto.RenameProfileRequest{
Handle: handle,
Username: currUser.Username,
NewProfileName: newProfilename,
_, err = daemonClient.AddProfile(cmd.Context(), &proto.AddProfileRequest{
ProfileName: profileName,
Username: currUser.Username,
})
if err != nil {
return wrapAmbiguityError(err, handle)
return err
}
dupCount, _ := countProfilesWithName(cmd.Context(), daemonClient, currUser.Username, newProfilename)
if dupCount > 1 {
cmd.Printf("Warning: %d other profile(s) already use the name %q.\n", dupCount-1, newProfilename)
cmd.Println("Use `netbird profile list --show-id` to disambiguate later.")
}
cmd.Printf("Profile renamed from %s to %s\n", profilemanager.StripCtrlChars(resp.OldProfileName), profilemanager.StripCtrlChars(newProfilename))
cmd.Println("Profile added successfully:", profileName)
return nil
}
func countProfilesWithName(ctx context.Context, c proto.DaemonServiceClient, username, name string) (int, error) {
resp, err := c.ListProfiles(ctx, &proto.ListProfilesRequest{Username: username})
if err != nil {
return 0, err
}
n := 0
for _, p := range resp.Profiles {
if p.Name == name {
n++
}
}
return n, nil
}
func removeProfileFunc(cmd *cobra.Command, args []string) error {
if err := setupCmd(cmd); err != nil {
return err
@@ -239,17 +153,18 @@ func removeProfileFunc(cmd *cobra.Command, args []string) error {
}
daemonClient := proto.NewDaemonServiceClient(conn)
handle := args[0]
resp, err := daemonClient.RemoveProfile(cmd.Context(), &proto.RemoveProfileRequest{
ProfileName: handle,
profileName := args[0]
_, err = daemonClient.RemoveProfile(cmd.Context(), &proto.RemoveProfileRequest{
ProfileName: profileName,
Username: currUser.Username,
})
if err != nil {
return wrapAmbiguityError(err, handle)
return err
}
cmd.Printf("Profile removed: %s\n", resp.Id)
cmd.Println("Profile removed successfully:", profileName)
return nil
}
@@ -259,7 +174,7 @@ func selectProfileFunc(cmd *cobra.Command, args []string) error {
}
profileManager := profilemanager.NewProfileManager()
handle := args[0]
profileName := args[0]
currUser, err := user.Current()
if err != nil {
@@ -276,15 +191,32 @@ func selectProfileFunc(cmd *cobra.Command, args []string) error {
daemonClient := proto.NewDaemonServiceClient(conn)
switchResp, err := daemonClient.SwitchProfile(ctx, &proto.SwitchProfileRequest{
ProfileName: &handle,
Username: &currUser.Username,
profiles, err := daemonClient.ListProfiles(ctx, &proto.ListProfilesRequest{
Username: currUser.Username,
})
if err != nil {
return wrapAmbiguityError(err, handle)
return fmt.Errorf("list profiles: %w", err)
}
if err := profileManager.SwitchProfile(profilemanager.ID(switchResp.Id)); err != nil {
var profileExists bool
for _, profile := range profiles.Profiles {
if profile.Name == profileName {
profileExists = true
break
}
}
if !profileExists {
return fmt.Errorf("profile %s does not exist", profileName)
}
if err := switchProfile(cmd.Context(), profileName, currUser.Username); err != nil {
return err
}
err = profileManager.SwitchProfile(profileName)
if err != nil {
return err
}
@@ -299,46 +231,6 @@ func selectProfileFunc(cmd *cobra.Command, args []string) error {
}
}
id := profilemanager.ID(switchResp.Id)
cmd.Printf("Profile switched to: %s\n", id.ShortID())
cmd.Println("Profile switched successfully to:", profileName)
return nil
}
// wrapAmbiguityError turns the daemon's gRPC InvalidArgument errors
// (which carry the resolver's message verbatim) into CLI-friendly text
// that points the user at --show-id.
func wrapAmbiguityError(err error, handle string) error {
if err == nil {
return nil
}
st, ok := gstatus.FromError(err)
if !ok {
return err
}
switch st.Code() {
case codes.InvalidArgument:
msg := st.Message()
if strings.Contains(msg, "ambiguous") {
return errors.New(msg + "\nRun `netbird profile list --show-id` to see IDs, then select by ID prefix:\n netbird profile select|remove <id-prefix>")
}
case codes.NotFound:
return fmt.Errorf("profile %q not found", handle)
}
return err
}
// addProfileOnDaemon issues the AddProfile RPC on an existing daemon client
// and returns the new profile's ID. It is the single entry point for profile
// creation, shared by `netbird profile add` and the `netbird up --profile
// <name>` auto-create path.
func addProfileOnDaemon(ctx context.Context, client proto.DaemonServiceClient, profileName, username string) (profilemanager.ID, error) {
resp, err := client.AddProfile(ctx, &proto.AddProfileRequest{
ProfileName: profileName,
Username: username,
})
if err != nil {
return "", fmt.Errorf("add profile failed: %w", err)
}
return profilemanager.ID(resp.Id), nil
}

View File

@@ -190,7 +190,6 @@ func init() {
// profile commands
profileCmd.AddCommand(profileListCmd)
profileCmd.AddCommand(profileAddCmd)
profileCmd.AddCommand(profileRenameCmd)
profileCmd.AddCommand(profileRemoveCmd)
profileCmd.AddCommand(profileSelectCmd)

View File

@@ -11,6 +11,7 @@ import (
"google.golang.org/grpc/status"
"github.com/netbirdio/netbird/client/internal"
"github.com/netbirdio/netbird/client/internal/profilemanager"
"github.com/netbirdio/netbird/client/proto"
nbstatus "github.com/netbirdio/netbird/client/status"
"github.com/netbirdio/netbird/util"
@@ -110,10 +111,11 @@ func statusFunc(cmd *cobra.Command, args []string) error {
return nil
}
// Resolve the active profile's display name via the daemon, which runs
// as root and can read the per-user profile files. The local profile
// manager only knows the active profile ID, not its display name.
profName := getActiveProfileName(ctx)
pm := profilemanager.NewProfileManager()
var profName string
if activeProf, err := pm.GetActiveProfile(); err == nil {
profName = activeProf.Name
}
var outputInformationHolder = nbstatus.ConvertToStatusOutputOverview(resp.GetFullStatus(), nbstatus.ConvertOptions{
Anonymize: anonymizeFlag,
@@ -165,25 +167,6 @@ func getStatus(ctx context.Context, fullPeerStatus bool, shouldRunProbes bool) (
return resp, nil
}
// getActiveProfileName asks the daemon for the active profile's display
// name. The daemon runs as root and can read the per-user profile files to
// resolve the ID to its human-readable name. Returns an empty string on any
// error so status output degrades gracefully.
func getActiveProfileName(ctx context.Context) string {
conn, err := DialClientGRPCServer(ctx, daemonAddr)
if err != nil {
return ""
}
defer conn.Close()
resp, err := proto.NewDaemonServiceClient(conn).GetActiveProfile(ctx, &proto.GetActiveProfileRequest{})
if err != nil {
return ""
}
return resp.GetProfileName()
}
func parseFilters() error {
switch strings.ToLower(statusFilter) {
case "", "idle", "connecting", "connected":

View File

@@ -128,9 +128,16 @@ func upFunc(cmd *cobra.Command, args []string) error {
var profileSwitched bool
// switch profile if provided
if profileName != "" {
if err := switchOrCreateProfile(cmd.Context(), pm, profileName, username.Username); err != nil {
err = switchProfile(cmd.Context(), profileName, username.Username)
if err != nil {
return fmt.Errorf("switch profile: %v", err)
}
err = pm.SwitchProfile(profileName)
if err != nil {
return fmt.Errorf("switch profile: %v", err)
}
profileSwitched = true
}
@@ -145,52 +152,6 @@ func upFunc(cmd *cobra.Command, args []string) error {
return runInDaemonMode(ctx, cmd, pm, activeProf, profileSwitched)
}
// switchOrCreateProfile switches the active profile to the one identified by
// handle, creating it first when it does not exist yet. This restores the
// pre-0.73 behaviour where `netbird up --profile <name>` auto-creates a
// missing profile instead of failing.
func switchOrCreateProfile(ctx context.Context, pm *profilemanager.ProfileManager, handle, username string) error {
resolvedID, err := switchProfile(ctx, handle, username)
if err != nil {
st, ok := gstatus.FromError(err)
if !ok || st.Code() != codes.NotFound {
return err
}
// Don't fail immediately on a create error: a concurrent run may
// have created the profile between the NotFound above and this
// call, in which case the retried switch still succeeds. Only
// surface the create error if the switch also fails.
_, createErr := createProfile(ctx, handle, username)
if resolvedID, err = switchProfile(ctx, handle, username); err != nil {
if createErr != nil {
return fmt.Errorf("create profile: %w", createErr)
}
return err
}
}
if err := pm.SwitchProfile(resolvedID); err != nil {
return err
}
return nil
}
// createProfile dials the daemon and creates a new profile with the given
// display name, returning its generated ID. Use addProfileOnDaemon directly
// when a daemon client is already available to reuse the connection.
func createProfile(ctx context.Context, profileName, username string) (profilemanager.ID, error) {
conn, err := DialClientGRPCServer(ctx, daemonAddr)
if err != nil {
//nolint
return "", fmt.Errorf("failed to connect to daemon error: %v\n"+
"If the daemon is not running please run: "+
"\nnetbird service install \nnetbird service start\n", err)
}
defer conn.Close()
return addProfileOnDaemon(ctx, proto.NewDaemonServiceClient(conn), profileName, username)
}
func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *profilemanager.Profile) error {
// override the default profile filepath if provided
if configPath != "" {
@@ -229,7 +190,7 @@ func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *pr
_, _ = profilemanager.UpdateOldManagementURL(ctx, config, configFilePath)
err = foregroundLogin(ctx, cmd, config, providedSetupKey, activeProf.ID)
err = foregroundLogin(ctx, cmd, config, providedSetupKey, activeProf.Name)
if err != nil {
return fmt.Errorf("foreground login failed: %v", err)
}
@@ -300,10 +261,10 @@ func runInDaemonMode(ctx context.Context, cmd *cobra.Command, pm *profilemanager
}
// set the new config
req := setupSetConfigReq(customDNSAddressConverted, cmd, activeProf.ID.String(), username.Username)
req := setupSetConfigReq(customDNSAddressConverted, cmd, activeProf.Name, username.Username)
if _, err := client.SetConfig(ctx, req); err != nil {
if st, ok := gstatus.FromError(err); ok && st.Code() == codes.Unavailable {
log.Warnf("setConfig method is not available in the daemon: %s", st.Message())
log.Warnf("setConfig method is not available in the daemon")
} else {
return fmt.Errorf("call service setConfig method: %v", err)
}
@@ -328,11 +289,10 @@ func doDaemonUp(ctx context.Context, cmd *cobra.Command, client proto.DaemonServ
return fmt.Errorf("setup login request: %v", err)
}
profileID := activeProf.ID.String()
loginRequest.ProfileName = &profileID
loginRequest.ProfileName = &activeProf.Name
loginRequest.Username = &username
profileState, err := pm.GetProfileState(activeProf.ID)
profileState, err := pm.GetProfileState(activeProf.Name)
if err != nil {
log.Debugf("failed to get profile state for login hint: %v", err)
} else if profileState.Email != "" {
@@ -369,7 +329,7 @@ func doDaemonUp(ctx context.Context, cmd *cobra.Command, client proto.DaemonServ
}
if _, err := client.Up(ctx, &proto.UpRequest{
ProfileName: &profileID,
ProfileName: &activeProf.Name,
Username: &username,
}); err != nil {
return fmt.Errorf("call service up method: %v", err)

View File

@@ -29,14 +29,14 @@ func TestUpDaemon(t *testing.T) {
}
sm := profilemanager.ServiceManager{}
created, err := sm.AddProfile("test1", currUser.Username)
err = sm.AddProfile("test1", currUser.Username)
if err != nil {
t.Fatalf("failed to add profile: %v", err)
return
}
err = sm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: created.ID,
Name: "test1",
Username: currUser.Username,
})
if err != nil {

View File

@@ -279,11 +279,9 @@ func (c *Client) Start(startCtx context.Context) error {
select {
case <-startCtx.Done():
// ConnectClient.Stop now cancels its own run context and waits for the
// run loop to tear the engine down, so this cancel() is no longer
// required to break the deadlock and could be removed. It is kept as a
// defensive belt-and-suspenders: cancelling the parent context first
// guarantees the run loop is unblocked even if Stop's contract regresses.
// Cancel the client context before stopping: Engine.Start blocks on the
// signal stream while holding the engine mutex and only unblocks on
// cancellation. Stopping first would deadlock on that mutex.
cancel()
if stopErr := client.Stop(); stopErr != nil {
return fmt.Errorf("stop error after context done. Stop error: %w. Context done: %w", stopErr, startCtx.Err())

View File

@@ -11,7 +11,6 @@ import (
"runtime/debug"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cenkalti/backoff/v4"
@@ -55,10 +54,6 @@ var androidRunOverride func(c *ConnectClient, runningChan chan struct{}, logPath
type ConnectClient struct {
ctx context.Context
runCancel context.CancelFunc
runExited chan struct{}
runOnce sync.Once
runStarted atomic.Bool
config *profilemanager.Config
statusRecorder *peer.Status
@@ -75,14 +70,8 @@ func NewConnectClient(
config *profilemanager.Config,
statusRecorder *peer.Status,
) *ConnectClient {
// Derive the run context here so Stop owns the cancel that unblocks the run
// loop. runCancel is set once at construction, so Stop can call it without
// racing the run loop's startup. Callers therefore need not cancel before Stop.
runCtx, runCancel := context.WithCancel(ctx)
return &ConnectClient{
ctx: runCtx,
runCancel: runCancel,
runExited: make(chan struct{}),
ctx: ctx,
config: config,
statusRecorder: statusRecorder,
engineMutex: sync.Mutex{},
@@ -129,8 +118,6 @@ func (c *ConnectClient) RunOniOS(
networkChangeListener listener.NetworkChangeListener,
dnsManager dns.IosDnsManager,
stateFilePath string,
cacheDir string,
logFilePath string,
) error {
// Set GC percent to 5% to reduce memory usage as iOS only allows 50MB of memory for the extension.
debug.SetGCPercent(5)
@@ -140,17 +127,11 @@ func (c *ConnectClient) RunOniOS(
NetworkChangeListener: networkChangeListener,
DnsManager: dnsManager,
StateFilePath: stateFilePath,
TempDir: cacheDir,
}
return c.run(mobileDependency, nil, logFilePath)
return c.run(mobileDependency, nil, "")
}
func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan struct{}, logPath string) error {
// Mark the loop as started and signal exit on return so Stop can wait for
// the loop to finish (and skip the wait if the loop never ran).
c.runStarted.Store(true)
defer c.runOnce.Do(func() { close(c.runExited) })
defer func() {
if r := recover(); r != nil {
rec := c.statusRecorder
@@ -306,7 +287,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
log.Debug(err)
if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
state.Set(StatusNeedsLogin)
c.runCancel()
_ = c.Stop()
return backoff.Permanent(wrapErr(err)) // unrecoverable error
}
return wrapErr(err)
@@ -426,10 +407,14 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
c.engine = nil
c.engineMutex.Unlock()
log.Infof("ensuring wg interface is removed, Netbird engine context cancelled")
// todo: consider to remove this condition. Is not thread safe.
// We should always call Stop(), but we need to verify that it is idempotent
if engine.wgInterface != nil {
log.Infof("ensuring %s is removed, Netbird engine context cancelled", engine.wgInterface.Name())
if err := engine.Stop(); err != nil {
log.Errorf("Failed to stop engine: %v", err)
if err := engine.Stop(); err != nil {
log.Errorf("Failed to stop engine: %v", err)
}
}
c.statusRecorder.ClientTeardown()
@@ -445,12 +430,12 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
}
c.statusRecorder.ClientStart()
err = backoff.Retry(operation, backoff.WithContext(backOff, c.ctx))
err = backoff.Retry(operation, backOff)
if err != nil {
log.Debugf("exiting client retry loop due to unrecoverable error: %s", err)
if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
state.Set(StatusNeedsLogin)
c.runCancel()
_ = c.Stop()
}
return err
}
@@ -528,9 +513,11 @@ func (c *ConnectClient) Status() StatusType {
}
func (c *ConnectClient) Stop() error {
c.runCancel()
if c.runStarted.Load() {
<-c.runExited
engine := c.Engine()
if engine != nil {
if err := engine.Stop(); err != nil {
return fmt.Errorf("stop engine: %w", err)
}
}
return nil
}

View File

@@ -250,7 +250,6 @@ type BundleGenerator struct {
syncResponse *mgmProto.SyncResponse
logPath string
tempDir string
statePath string
cpuProfile []byte
capturePath string
refreshStatus func() // Optional callback to refresh status before bundle generation
@@ -277,7 +276,6 @@ type GeneratorDependencies struct {
SyncResponse *mgmProto.SyncResponse
LogPath string
TempDir string // Directory for temporary bundle zip files. If empty, os.TempDir() is used.
StatePath string // Path to the state file. If empty, the ServiceManager default path is used.
CPUProfile []byte
CapturePath string
RefreshStatus func()
@@ -301,7 +299,6 @@ func NewBundleGenerator(deps GeneratorDependencies, cfg BundleConfig) *BundleGen
syncResponse: deps.SyncResponse,
logPath: deps.LogPath,
tempDir: deps.TempDir,
statePath: deps.StatePath,
cpuProfile: deps.CPUProfile,
capturePath: deps.CapturePath,
refreshStatus: deps.RefreshStatus,
@@ -853,11 +850,8 @@ func (g *BundleGenerator) maskSecrets() {
}
func (g *BundleGenerator) addStateFile() error {
path := g.statePath
if path == "" {
sm := profilemanager.NewServiceManager("")
path = sm.GetStatePath()
}
sm := profilemanager.NewServiceManager("")
path := sm.GetStatePath()
if path == "" {
return nil
}

View File

@@ -1,36 +0,0 @@
//go:build ios
package debug
import (
"path/filepath"
log "github.com/sirupsen/logrus"
)
// swiftLogFile is the Swift app log written by the iOS app into the same log
// directory as the Go client log, so it can be collected into the bundle.
const swiftLogFile = "swift-log.log"
// addPlatformLog collects logs for the iOS debug bundle. iOS has no logcat or
// systemd journal, so we rely on file-based logs. addLogfile handles the Go
// client log (logPath) with rotation, the stderr/stdout companions and
// anonymization. The iOS app writes its own Swift log into the same directory,
// so we add it alongside the Go log.
func (g *BundleGenerator) addPlatformLog() error {
if err := g.addLogfile(); err != nil {
return err
}
if g.logPath == "" {
return nil
}
swiftLogPath := filepath.Join(filepath.Dir(g.logPath), swiftLogFile)
if err := g.addSingleLogfile(swiftLogPath, swiftLogFile); err != nil {
// The Swift log is best-effort: the app may not have written it yet.
log.Warnf("failed to add %s to debug bundle: %v", swiftLogFile, err)
}
return nil
}

View File

@@ -1,4 +1,4 @@
//go:build !android && !ios
//go:build !android
package debug

View File

@@ -843,7 +843,6 @@ func TestAddConfig_AllFieldsCovered(t *testing.T) {
"PreSharedKey": "sensitive: WireGuard pre-shared key",
"SSHKey": "sensitive: SSH private key",
"ClientCertKeyPair": "non-config: parsed cert pair, not serialized",
"Name": "non-config: profile name is not needed for debug purposes",
"policy": "non-config: in-memory MDM policy snapshot, surfaced via Config.Policy() / GetConfigResponse.MDMManagedFields",
}

View File

@@ -51,20 +51,13 @@ type cachedRecord struct {
}
// Resolver caches critical NetBird infrastructure domains.
// records, refreshing, failedResolves, mgmtDomain and serverDomains are all
// guarded by mutex.
// records, refreshing, mgmtDomain and serverDomains are all guarded by mutex.
type Resolver struct {
records map[dns.Question]*cachedRecord
mgmtDomain *domain.Domain
serverDomains *dnsconfig.ServerDomains
mutex sync.RWMutex
// failedResolves records the last failed initial resolve per domain so a
// domain that never resolves isn't retried on every server-domains update
// until refreshBackoff elapses. Entries are cleared on success and pruned
// to the current server-domains set.
failedResolves map[domain.Domain]time.Time
chain ChainResolver
chainMaxPriority int
refreshGroup singleflight.Group
@@ -83,10 +76,9 @@ type Resolver struct {
// NewResolver creates a new management domains cache resolver.
func NewResolver() *Resolver {
return &Resolver{
records: make(map[dns.Question]*cachedRecord),
refreshing: make(map[dns.Question]*atomic.Bool),
failedResolves: make(map[domain.Domain]time.Time),
cacheTTL: resolveCacheTTL(),
records: make(map[dns.Question]*cachedRecord),
refreshing: make(map[dns.Question]*atomic.Bool),
cacheTTL: resolveCacheTTL(),
}
}
@@ -181,9 +173,7 @@ func (m *Resolver) continueToNext(w dns.ResponseWriter, r *dns.Msg) {
// AddDomain resolves a domain and stores its A/AAAA records in the cache.
// A family that resolves NODATA (nil err, zero records) evicts any stale
// entry for that qtype. When one family hard-errors while the other succeeds,
// the resolved family is still cached but AddDomain returns an error so the
// caller retries the incomplete resolve rather than treating it as complete.
// entry for that qtype.
func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))
@@ -213,10 +203,6 @@ func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
log.Debugf("added/updated domain=%s with %d A records and %d AAAA records",
d.SafeString(), len(aRecords), len(aaaaRecords))
if errA != nil || errAAAA != nil {
return fmt.Errorf("resolve %s: incomplete, a family failed: %w", d.SafeString(), errors.Join(errA, errAAAA))
}
return nil
}
@@ -476,7 +462,6 @@ func (m *Resolver) RemoveDomain(d domain.Domain) error {
delete(m.records, qAAAA)
delete(m.refreshing, qA)
delete(m.refreshing, qAAAA)
delete(m.failedResolves, d)
log.Debugf("removed domain=%s from cache", d.SafeString())
return nil
@@ -520,7 +505,6 @@ func (m *Resolver) UpdateFromServerDomains(ctx context.Context, serverDomains dn
allDomains := m.extractDomainsFromServerDomains(updatedServerDomains)
currentDomains := m.GetCachedDomains()
removedDomains = m.removeStaleDomains(currentDomains, allDomains)
m.pruneFailedResolves(allDomains)
}
m.addNewDomains(ctx, newDomains)
@@ -593,85 +577,13 @@ func (m *Resolver) isManagementDomain(domain domain.Domain) bool {
return m.mgmtDomain != nil && domain == *m.mgmtDomain
}
// addNewDomains resolves and caches domains that are not yet in the cache,
// running the lookups concurrently. Domains already cached are skipped and left
// to the stale-while-revalidate refresh path, so a sync never re-resolves them
// synchronously: once NetBird owns the OS resolver the resolve runs through the
// handler chain and would otherwise dial the managed upstreams under the engine
// sync lock on every update.
// addNewDomains resolves and caches all domains from the update
func (m *Resolver) addNewDomains(ctx context.Context, newDomains domain.List) {
var wg sync.WaitGroup
seen := make(map[domain.Domain]struct{}, len(newDomains))
for _, newDomain := range newDomains {
if _, dup := seen[newDomain]; dup {
continue
}
seen[newDomain] = struct{}{}
if !m.needsResolve(newDomain) {
continue
}
wg.Add(1)
go func(d domain.Domain) {
defer wg.Done()
if err := m.AddDomain(ctx, d); err != nil {
m.markResolveFailed(d)
log.Warnf("failed to add/update domain=%s: %v", d.SafeString(), err)
return
}
m.clearResolveFailed(d)
log.Debugf("added/updated management cache domain=%s", d.SafeString())
}(newDomain)
}
wg.Wait()
}
// needsResolve reports whether d should be resolved now. A recent failed or
// incomplete resolve gates retries on the backoff even when one family is
// already cached, so a transiently-failed family is retried instead of being
// treated as fully resolved. Otherwise a domain with any cached record is left
// to the stale-while-revalidate refresh path.
func (m *Resolver) needsResolve(d domain.Domain) bool {
dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))
m.mutex.RLock()
defer m.mutex.RUnlock()
if failedAt, ok := m.failedResolves[d]; ok {
return time.Since(failedAt) >= refreshBackoff
}
for _, qtype := range []uint16{dns.TypeA, dns.TypeAAAA} {
q := dns.Question{Name: dnsName, Qtype: qtype, Qclass: dns.ClassINET}
if _, ok := m.records[q]; ok {
return false
}
}
return true
}
func (m *Resolver) markResolveFailed(d domain.Domain) {
m.mutex.Lock()
m.failedResolves[d] = time.Now()
m.mutex.Unlock()
}
func (m *Resolver) clearResolveFailed(d domain.Domain) {
m.mutex.Lock()
delete(m.failedResolves, d)
m.mutex.Unlock()
}
// pruneFailedResolves drops failure markers for domains no longer present in
// the server-domains set, keeping the map bounded to the current set (a
// failed-only domain has no cached record, so RemoveDomain never sees it).
func (m *Resolver) pruneFailedResolves(domains domain.List) {
m.mutex.Lock()
defer m.mutex.Unlock()
for d := range m.failedResolves {
if !slices.Contains(domains, d) {
delete(m.failedResolves, d)
if err := m.AddDomain(ctx, newDomain); err != nil {
log.Warnf("failed to add/update domain=%s: %v", newDomain.SafeString(), err)
} else {
log.Debugf("added/updated management cache domain=%s", newDomain.SafeString())
}
}
}

View File

@@ -21,7 +21,6 @@ type fakeChain struct {
mu sync.Mutex
calls map[string]int
answers map[string][]dns.RR
qErr map[string]error
err error
hasRoot bool
onLookup func()
@@ -31,7 +30,6 @@ func newFakeChain() *fakeChain {
return &fakeChain{
calls: map[string]int{},
answers: map[string][]dns.RR{},
qErr: map[string]error{},
hasRoot: true,
}
}
@@ -49,9 +47,6 @@ func (f *fakeChain) ResolveInternal(ctx context.Context, msg *dns.Msg, maxPriori
f.calls[key]++
answers := f.answers[key]
err := f.err
if err == nil {
err = f.qErr[key]
}
onLookup := f.onLookup
f.mu.Unlock()
@@ -80,12 +75,6 @@ func (f *fakeChain) setAnswer(name string, qtype uint16, ip string) {
}
}
func (f *fakeChain) setErr(name string, qtype uint16, err error) {
f.mu.Lock()
defer f.mu.Unlock()
f.qErr[name+"|"+dns.TypeToString[qtype]] = err
}
func (f *fakeChain) callCount(name string, qtype uint16) int {
f.mu.Lock()
defer f.mu.Unlock()

View File

@@ -1,183 +0,0 @@
package mgmt
import (
"context"
"errors"
"sync/atomic"
"testing"
"time"
"github.com/miekg/dns"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config"
"github.com/netbirdio/netbird/shared/management/domain"
)
// A domain already in the cache must not be re-resolved on a subsequent server
// domains update; it is left to the stale-while-revalidate refresh path.
func TestResolver_UpdateFromServerDomains_SkipsCached(t *testing.T) {
r := NewResolver()
chain := newFakeChain()
chain.setAnswer("signal.example.com.", dns.TypeA, "10.0.0.2")
r.SetChainResolver(chain, 50)
sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
_, err := r.UpdateFromServerDomains(context.Background(), sd)
require.NoError(t, err)
require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
"first update must resolve the domain")
_, err = r.UpdateFromServerDomains(context.Background(), sd)
require.NoError(t, err)
assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
"cached domain must not be re-resolved on a subsequent update")
}
// New domains in a single update must resolve concurrently rather than serially.
func TestResolver_AddNewDomains_ResolvesConcurrently(t *testing.T) {
r := NewResolver()
chain := newFakeChain()
var inflight, maxInflight atomic.Int32
chain.onLookup = func() {
n := inflight.Add(1)
for {
old := maxInflight.Load()
if n <= old || maxInflight.CompareAndSwap(old, n) {
break
}
}
time.Sleep(50 * time.Millisecond)
inflight.Add(-1)
}
relays := []domain.Domain{"a.example.com", "b.example.com", "c.example.com", "d.example.com"}
for _, d := range relays {
chain.setAnswer(dns.Fqdn(string(d)), dns.TypeA, "10.0.0.2")
}
r.SetChainResolver(chain, 50)
start := time.Now()
_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: relays})
require.NoError(t, err)
elapsed := time.Since(start)
assert.GreaterOrEqual(t, int(maxInflight.Load()), 2, "domains must resolve concurrently")
// Serial resolution of 4 domains would take at least 4*50ms; concurrent is far less.
assert.Less(t, elapsed, 300*time.Millisecond, "resolution should not be serial")
}
// A domain that fails to resolve must not be retried on every update; the
// failure backoff suppresses re-resolution until it expires.
func TestResolver_UpdateFromServerDomains_BacksOffFailures(t *testing.T) {
r := NewResolver()
chain := newFakeChain()
chain.err = errors.New("resolve boom")
r.SetChainResolver(chain, 50)
sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
_, err := r.UpdateFromServerDomains(context.Background(), sd)
require.NoError(t, err)
require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
"first update must attempt the resolve")
_, err = r.UpdateFromServerDomains(context.Background(), sd)
require.NoError(t, err)
assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
"failed resolve must back off and not retry on the next update")
}
// A domain listed under more than one server-domain type (e.g. STUN and TURN on
// the same host) must be resolved once per update, not once per occurrence.
func TestResolver_AddNewDomains_DedupesDuplicateDomains(t *testing.T) {
r := NewResolver()
chain := newFakeChain()
chain.setAnswer("dup.example.com.", dns.TypeA, "10.0.0.9")
r.SetChainResolver(chain, 50)
sd := dnsconfig.ServerDomains{
Stuns: []domain.Domain{"dup.example.com"},
Turns: []domain.Domain{"dup.example.com"},
}
_, err := r.UpdateFromServerDomains(context.Background(), sd)
require.NoError(t, err)
assert.Equal(t, 1, chain.callCount("dup.example.com.", dns.TypeA),
"a domain appearing under multiple server-domain types must resolve once")
}
// A failure marker must be dropped once its domain leaves the server-domains set
// so the map stays bounded to the current set.
func TestResolver_UpdateFromServerDomains_PrunesFailedResolves(t *testing.T) {
r := NewResolver()
chain := newFakeChain()
chain.err = errors.New("resolve boom")
r.SetChainResolver(chain, 50)
_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("gone.example.com")})
require.NoError(t, err)
r.mutex.RLock()
_, marked := r.failedResolves[domain.Domain("gone.example.com")]
r.mutex.RUnlock()
require.True(t, marked, "failed resolve must be recorded")
_, err = r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("other.example.com")})
require.NoError(t, err)
r.mutex.RLock()
_, stillMarked := r.failedResolves[domain.Domain("gone.example.com")]
r.mutex.RUnlock()
assert.False(t, stillMarked, "failure marker for a domain no longer in the set must be pruned")
}
// When one family hard-errors while the other resolves, the domain is cached
// for the working family but recorded as incomplete so the failed family is
// retried under backoff instead of being treated as fully resolved forever.
func TestResolver_AddNewDomains_RetriesPartialFamilyFailure(t *testing.T) {
d := domain.Domain("relay.example.com")
r := NewResolver()
chain := newFakeChain()
chain.setAnswer("relay.example.com.", dns.TypeA, "10.0.0.2")
chain.setErr("relay.example.com.", dns.TypeAAAA, errors.New("servfail"))
r.SetChainResolver(chain, 50)
_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
require.NoError(t, err)
r.mutex.RLock()
_, aCached := r.records[dns.Question{Name: "relay.example.com.", Qtype: dns.TypeA, Qclass: dns.ClassINET}]
_, marked := r.failedResolves[d]
r.mutex.RUnlock()
require.True(t, aCached, "the working family must still be cached")
require.True(t, marked, "a partial failure must be recorded so the failed family is retried")
assert.False(t, r.needsResolve(d), "within the backoff window the domain is not retried")
r.mutex.Lock()
r.failedResolves[d] = time.Now().Add(-2 * refreshBackoff)
r.mutex.Unlock()
assert.True(t, r.needsResolve(d), "after the backoff elapses the domain is retried to pick up the missing family")
}
// A family that returns NODATA (legitimately absent, e.g. an IPv4-only host) is
// not a failure: the domain must not be marked for retry, otherwise it would be
// re-resolved on every sync.
func TestResolver_AddNewDomains_NodataIsNotFailure(t *testing.T) {
d := domain.Domain("v4only.example.com")
r := NewResolver()
chain := newFakeChain()
chain.setAnswer("v4only.example.com.", dns.TypeA, "10.0.0.2")
r.SetChainResolver(chain, 50)
_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
require.NoError(t, err)
r.mutex.RLock()
_, marked := r.failedResolves[d]
r.mutex.RUnlock()
assert.False(t, marked, "a NODATA family must not be recorded as a failure")
assert.False(t, r.needsResolve(d), "an IPv4-only host must not be re-resolved on later syncs")
}

View File

@@ -207,35 +207,3 @@ func FormatAnswers(answers []dns.RR) string {
}
return "[" + strings.Join(parts, ", ") + "]"
}
// StripOPT removes any OPT pseudo-RRs from the message's Extra section. Per
// RFC 6891 a responder must not include an OPT RR toward a client that did not
// advertise EDNS0.
func StripOPT(msg *dns.Msg) {
if len(msg.Extra) == 0 {
return
}
out := msg.Extra[:0]
for _, rr := range msg.Extra {
if _, ok := rr.(*dns.OPT); ok {
continue
}
out = append(out, rr)
}
msg.Extra = out
}
// ExtractEDE returns the first Extended DNS Error (RFC 8914) option carried in
// the message, if present.
func ExtractEDE(msg *dns.Msg) (*dns.EDNS0_EDE, bool) {
opt := msg.IsEdns0()
if opt == nil {
return nil, false
}
for _, o := range opt.Option {
if ede, ok := o.(*dns.EDNS0_EDE); ok {
return ede, true
}
}
return nil, false
}

View File

@@ -120,42 +120,3 @@ func TestLookupIP_DNSErrorNotIsNotFound(t *testing.T) {
assert.Equal(t, dns.RcodeServerFailure, result.Rcode, "upstream failure should map to SERVFAIL")
}
func TestStripOPT(t *testing.T) {
rm := &dns.Msg{
Extra: []dns.RR{
&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
},
}
StripOPT(rm)
assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
_, isOPT := rm.Extra[0].(*dns.OPT)
assert.False(t, isOPT, "remaining record must not be OPT")
}
func TestExtractEDE(t *testing.T) {
t.Run("no edns", func(t *testing.T) {
_, ok := ExtractEDE(&dns.Msg{})
assert.False(t, ok, "message without OPT has no EDE")
})
t.Run("edns without ede", func(t *testing.T) {
rm := &dns.Msg{}
rm.SetEdns0(4096, false)
_, ok := ExtractEDE(rm)
assert.False(t, ok, "OPT without EDE option returns false")
})
t.Run("with ede", func(t *testing.T) {
rm := &dns.Msg{}
opt := &dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}}
opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: 49152, ExtraText: "upstream timeout"})
rm.Extra = append(rm.Extra, opt)
ede, ok := ExtractEDE(rm)
assert.True(t, ok, "EDE option should be found")
assert.Equal(t, uint16(49152), ede.InfoCode)
assert.Equal(t, "upstream timeout", ede.ExtraText)
})
}

View File

@@ -6,7 +6,6 @@ import (
"fmt"
"net/netip"
"net/url"
"os"
"slices"
"strings"
"sync"
@@ -39,15 +38,11 @@ const (
// defaultWarningDelayBase is the starting grace window before a
// "Nameserver group unreachable" event fires for a group that's
// never been healthy and only has overlay upstreams with no
// Connected peer. Per-server and overridable via envWarningDelay;
// see warningDelay.
defaultWarningDelayBase = 60 * time.Second
// Connected peer. Per-server and overridable; see warningDelayFor.
defaultWarningDelayBase = 30 * time.Second
// warningDelayBonusCap caps the route-count bonus added to the
// base grace window. See warningDelay.
// base grace window. See warningDelayFor.
warningDelayBonusCap = 30 * time.Second
// envWarningDelay overrides defaultWarningDelayBase with a Go duration
// string (e.g. "90s", "2m"). Invalid or non-positive values are ignored.
envWarningDelay = "NB_DNS_HEALTH_WARNING_DELAY"
)
// errNoUsableNameservers signals that a merged-domain group has no usable
@@ -140,7 +135,7 @@ type DefaultServer struct {
disableSys bool
mux sync.Mutex
service service
dnsMuxHandlers []handlerWrapper
dnsMuxMap registeredHandlerMap
localResolver *local.Resolver
wgInterface WGIface
hostManager hostManager
@@ -204,6 +199,8 @@ type handlerWrapper struct {
priority int
}
type registeredHandlerMap map[types.HandlerID]handlerWrapper
// DefaultServerConfig holds configuration parameters for NewDefaultServer
type DefaultServerConfig struct {
WgInterface WGIface
@@ -292,6 +289,7 @@ func newDefaultServer(
service: dnsService,
handlerChain: handlerChain,
extraDomains: make(map[domain.Domain]int),
dnsMuxMap: make(registeredHandlerMap),
localResolver: local.NewResolver(),
wgInterface: wgInterface,
statusRecorder: statusRecorder,
@@ -300,7 +298,7 @@ func newDefaultServer(
hostManager: &noopHostConfigurator{},
mgmtCacheResolver: mgmtCacheResolver,
currentConfigHash: ^uint64(0), // Initialize to max uint64 to ensure first config is always applied
warningDelayBase: warningDelayBaseFromEnv(),
warningDelayBase: defaultWarningDelayBase,
healthRefresh: make(chan struct{}, 1),
}
// Wire the local resolver against the peer status recorder so it can
@@ -330,7 +328,7 @@ func (s *DefaultServer) SetRouteSources(selected, active func() route.HAMap) {
type routeSettable interface {
setSelectedRoutes(func() route.HAMap)
}
for _, entry := range s.dnsMuxHandlers {
for _, entry := range s.dnsMuxMap {
if h, ok := entry.handler.(routeSettable); ok {
h.setSelectedRoutes(selected)
}
@@ -980,23 +978,19 @@ func (s *DefaultServer) usableNameServers(nameServers []nbdns.NameServer) []neti
func (s *DefaultServer) updateMux(muxUpdates []handlerWrapper) {
// this will introduce a short period of time when the server is not able to handle DNS requests
for _, existing := range s.dnsMuxHandlers {
for _, existing := range s.dnsMuxMap {
s.deregisterHandler([]string{existing.domain}, existing.priority)
// The local resolver is a persistent singleton shared by every custom
// zone and reused across config updates. Its chain registrations are
// per-config and must be deregistered, but Stop() cancels its lookup
// context (breaking external CNAME-target resolution) and clears its
// records, so it must not be torn down here.
if existing.handler != s.localResolver {
existing.handler.Stop()
}
existing.handler.Stop()
}
muxUpdateMap := make(registeredHandlerMap)
for _, update := range muxUpdates {
s.registerHandler([]string{update.domain}, update.handler, update.priority)
muxUpdateMap[update.handler.ID()] = update
}
s.dnsMuxHandlers = muxUpdates
s.dnsMuxMap = muxUpdateMap
}
// updateNSGroupStates records the new group set and pokes the refresher.
@@ -1160,26 +1154,6 @@ func (s *DefaultServer) projectUnhealthy(p *nsGroupProj, servers []netip.AddrPor
return false
}
// warningDelayBaseFromEnv returns the base grace window, honoring
// envWarningDelay when it holds a valid positive Go duration. Invalid or
// non-positive values fall back to defaultWarningDelayBase.
func warningDelayBaseFromEnv() time.Duration {
val := os.Getenv(envWarningDelay)
if val == "" {
return defaultWarningDelayBase
}
d, err := time.ParseDuration(val)
if err != nil {
log.Warnf("invalid %s value %q, using default %v: %v", envWarningDelay, val, defaultWarningDelayBase, err)
return defaultWarningDelayBase
}
if d <= 0 {
log.Warnf("%s must be positive, got %v, using default %v", envWarningDelay, d, defaultWarningDelayBase)
return defaultWarningDelayBase
}
return d
}
// warningDelay returns the grace window for the given selected-route
// count. Scales gently: +1s per 100 routes, capped by
// warningDelayBonusCap. Parallel handshakes mean handshake time grows
@@ -1230,7 +1204,7 @@ func (s *DefaultServer) groupHasImmediateUpstream(servers []netip.AddrPort, snap
// in more than one handler.
func (s *DefaultServer) collectUpstreamHealth() map[netip.AddrPort]UpstreamHealth {
merged := make(map[netip.AddrPort]UpstreamHealth)
for _, entry := range s.dnsMuxHandlers {
for _, entry := range s.dnsMuxMap {
reporter, ok := entry.handler.(upstreamHealthReporter)
if !ok {
continue

View File

@@ -104,6 +104,19 @@ func init() {
formatter.SetTextFormatter(log.StandardLogger())
}
func generateDummyHandler(d string, servers []nbdns.NameServer) *upstreamResolverBase {
var srvs []netip.AddrPort
for _, srv := range servers {
srvs = append(srvs, srv.AddrPort())
}
u := &upstreamResolverBase{
domain: domain.Domain(d),
cancel: func() {},
}
u.addRace(srvs)
return u
}
func TestUpdateDNSServer(t *testing.T) {
nameServers := []nbdns.NameServer{
@@ -119,20 +132,22 @@ func TestUpdateDNSServer(t *testing.T) {
},
}
dummyHandler := local.NewResolver()
testCases := []struct {
name string
initUpstreamMap []handlerWrapper
initUpstreamMap registeredHandlerMap
initLocalZones []nbdns.CustomZone
initSerial uint64
inputSerial uint64
inputUpdate nbdns.Config
shouldFail bool
expectedUpstreamMap []handlerWrapper
expectedUpstreamMap registeredHandlerMap
expectedLocalQs []dns.Question
}{
{
name: "Initial Config Should Succeed",
initUpstreamMap: nil,
initUpstreamMap: make(registeredHandlerMap),
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{
@@ -154,17 +169,20 @@ func TestUpdateDNSServer(t *testing.T) {
},
},
},
expectedUpstreamMap: []handlerWrapper{
{
expectedUpstreamMap: registeredHandlerMap{
generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
domain: "netbird.io",
handler: dummyHandler,
priority: PriorityUpstream,
},
{
dummyHandler.ID(): handlerWrapper{
domain: "netbird.cloud",
handler: dummyHandler,
priority: PriorityLocal,
},
{
generateDummyHandler(".", nameServers).ID(): handlerWrapper{
domain: nbdns.RootZone,
handler: dummyHandler,
priority: PriorityDefault,
},
},
@@ -173,10 +191,10 @@ func TestUpdateDNSServer(t *testing.T) {
{
name: "New Config Should Succeed",
initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: 1, Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
initUpstreamMap: []handlerWrapper{
{
initUpstreamMap: registeredHandlerMap{
generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
domain: "netbird.cloud",
handler: &mockHandler{},
handler: dummyHandler,
priority: PriorityUpstream,
},
},
@@ -197,13 +215,15 @@ func TestUpdateDNSServer(t *testing.T) {
},
},
},
expectedUpstreamMap: []handlerWrapper{
{
expectedUpstreamMap: registeredHandlerMap{
generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
domain: "netbird.io",
handler: dummyHandler,
priority: PriorityUpstream,
},
{
"local-resolver": handlerWrapper{
domain: "netbird.cloud",
handler: dummyHandler,
priority: PriorityLocal,
},
},
@@ -212,7 +232,7 @@ func TestUpdateDNSServer(t *testing.T) {
{
name: "Smaller Config Serial Should Be Skipped",
initLocalZones: []nbdns.CustomZone{},
initUpstreamMap: nil,
initUpstreamMap: make(registeredHandlerMap),
initSerial: 2,
inputSerial: 1,
shouldFail: true,
@@ -220,7 +240,7 @@ func TestUpdateDNSServer(t *testing.T) {
{
name: "Empty NS Group Domain Or Not Primary Element Should Fail",
initLocalZones: []nbdns.CustomZone{},
initUpstreamMap: nil,
initUpstreamMap: make(registeredHandlerMap),
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{
@@ -242,7 +262,7 @@ func TestUpdateDNSServer(t *testing.T) {
{
name: "Invalid NS Group Nameservers list Should Fail",
initLocalZones: []nbdns.CustomZone{},
initUpstreamMap: nil,
initUpstreamMap: make(registeredHandlerMap),
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{
@@ -264,7 +284,7 @@ func TestUpdateDNSServer(t *testing.T) {
{
name: "Invalid Custom Zone Records list Should Skip",
initLocalZones: []nbdns.CustomZone{},
initUpstreamMap: nil,
initUpstreamMap: make(registeredHandlerMap),
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{
@@ -281,41 +301,42 @@ func TestUpdateDNSServer(t *testing.T) {
},
},
},
expectedUpstreamMap: []handlerWrapper{{
expectedUpstreamMap: registeredHandlerMap{generateDummyHandler(".", nameServers).ID(): handlerWrapper{
domain: ".",
handler: dummyHandler,
priority: PriorityDefault,
}},
},
{
name: "Empty Config Should Succeed and Clean Maps",
initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
initUpstreamMap: []handlerWrapper{
{
initUpstreamMap: registeredHandlerMap{
generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
domain: zoneRecords[0].Name,
handler: &mockHandler{},
handler: dummyHandler,
priority: PriorityUpstream,
},
},
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{ServiceEnable: true},
expectedUpstreamMap: nil,
expectedUpstreamMap: make(registeredHandlerMap),
expectedLocalQs: []dns.Question{},
},
{
name: "Disabled Service Should clean map",
initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
initUpstreamMap: []handlerWrapper{
{
initUpstreamMap: registeredHandlerMap{
generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
domain: zoneRecords[0].Name,
handler: &mockHandler{},
handler: dummyHandler,
priority: PriorityUpstream,
},
},
initSerial: 0,
inputSerial: 1,
inputUpdate: nbdns.Config{ServiceEnable: false},
expectedUpstreamMap: nil,
expectedUpstreamMap: make(registeredHandlerMap),
expectedLocalQs: []dns.Question{},
},
}
@@ -372,7 +393,7 @@ func TestUpdateDNSServer(t *testing.T) {
}
}()
dnsServer.dnsMuxHandlers = testCase.initUpstreamMap
dnsServer.dnsMuxMap = testCase.initUpstreamMap
dnsServer.localResolver.Update(testCase.initLocalZones)
dnsServer.updateSerial = testCase.initSerial
@@ -384,20 +405,14 @@ func TestUpdateDNSServer(t *testing.T) {
t.Fatalf("update dns server should not fail, got error: %v", err)
}
if len(dnsServer.dnsMuxHandlers) != len(testCase.expectedUpstreamMap) {
t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxHandlers))
if len(dnsServer.dnsMuxMap) != len(testCase.expectedUpstreamMap) {
t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxMap))
}
for _, expected := range testCase.expectedUpstreamMap {
found := false
for _, got := range dnsServer.dnsMuxHandlers {
if got.domain == expected.domain && got.priority == expected.priority {
found = true
break
}
}
for key := range testCase.expectedUpstreamMap {
_, found := dnsServer.dnsMuxMap[key]
if !found {
t.Fatalf("update upstream failed, handler for domain=%s priority=%d not found in dnsMuxHandlers: %#v", expected.domain, expected.priority, dnsServer.dnsMuxHandlers)
t.Fatalf("update upstream failed, key %s was not found in the dnsMuxMap: %#v", key, dnsServer.dnsMuxMap)
}
}
@@ -497,8 +512,8 @@ func TestDNSFakeResolverHandleUpdates(t *testing.T) {
}
}()
dnsServer.dnsMuxHandlers = []handlerWrapper{
{
dnsServer.dnsMuxMap = registeredHandlerMap{
"id1": handlerWrapper{
domain: zoneRecords[0].Name,
handler: &local.Resolver{},
priority: PriorityUpstream,
@@ -1014,15 +1029,15 @@ func (m *mockService) RegisterMux(string, dns.Handler) {}
func (m *mockService) DeregisterMux(string) {}
func TestDefaultServer_UpdateMux(t *testing.T) {
baseMatchHandlers := []handlerWrapper{
{
baseMatchHandlers := registeredHandlerMap{
"upstream-group1": {
domain: "example.com",
handler: &mockHandler{
Id: "upstream-group1",
},
priority: PriorityUpstream,
},
{
"upstream-group2": {
domain: "example.com",
handler: &mockHandler{
Id: "upstream-group2",
@@ -1031,15 +1046,15 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
},
}
baseRootHandlers := []handlerWrapper{
{
baseRootHandlers := registeredHandlerMap{
"upstream-root1": {
domain: ".",
handler: &mockHandler{
Id: "upstream-root1",
},
priority: PriorityDefault,
},
{
"upstream-root2": {
domain: ".",
handler: &mockHandler{
Id: "upstream-root2",
@@ -1048,22 +1063,22 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
},
}
baseMixedHandlers := []handlerWrapper{
{
baseMixedHandlers := registeredHandlerMap{
"upstream-group1": {
domain: "example.com",
handler: &mockHandler{
Id: "upstream-group1",
},
priority: PriorityUpstream,
},
{
"upstream-group2": {
domain: "example.com",
handler: &mockHandler{
Id: "upstream-group2",
},
priority: PriorityUpstream - 1,
},
{
"upstream-other": {
domain: "other.com",
handler: &mockHandler{
Id: "upstream-other",
@@ -1074,7 +1089,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
tests := []struct {
name string
initialHandlers []handlerWrapper
initialHandlers registeredHandlerMap
updates []handlerWrapper
expectedHandlers map[string]string // map[HandlerID]domain
description string
@@ -1358,38 +1373,32 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
server := &DefaultServer{
dnsMuxHandlers: tt.initialHandlers,
handlerChain: NewHandlerChain(),
service: &mockService{},
dnsMuxMap: tt.initialHandlers,
handlerChain: NewHandlerChain(),
service: &mockService{},
}
// Perform the update
server.updateMux(tt.updates)
// Verify the results
assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxHandlers),
assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxMap),
"Number of handlers after update doesn't match expected")
// Check each expected handler
for id, expectedDomain := range tt.expectedHandlers {
var found *handlerWrapper
for i := range server.dnsMuxHandlers {
if server.dnsMuxHandlers[i].handler.ID() == types.HandlerID(id) {
found = &server.dnsMuxHandlers[i]
break
}
}
assert.NotNil(t, found, "Expected handler %s not found", id)
if found != nil {
assert.Equal(t, expectedDomain, found.domain,
handler, exists := server.dnsMuxMap[types.HandlerID(id)]
assert.True(t, exists, "Expected handler %s not found", id)
if exists {
assert.Equal(t, expectedDomain, handler.domain,
"Domain mismatch for handler %s", id)
}
}
// Verify no unexpected handlers exist
for _, entry := range server.dnsMuxHandlers {
_, expected := tt.expectedHandlers[string(entry.handler.ID())]
assert.True(t, expected, "Unexpected handler found: %s", entry.handler.ID())
for HandlerID := range server.dnsMuxMap {
_, expected := tt.expectedHandlers[string(HandlerID)]
assert.True(t, expected, "Unexpected handler found: %s", HandlerID)
}
// Verify the handlerChain state and order
@@ -1404,7 +1413,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
// Verify handler exists in mux
foundInMux := false
for _, muxEntry := range server.dnsMuxHandlers {
for _, muxEntry := range server.dnsMuxMap {
if chainEntry.Handler == muxEntry.handler &&
chainEntry.Priority == muxEntry.priority &&
chainEntry.Pattern == dns.Fqdn(muxEntry.domain) {
@@ -1413,108 +1422,12 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
}
}
assert.True(t, foundInMux,
"Handler in chain not found in dnsMuxHandlers")
"Handler in chain not found in dnsMuxMap")
}
})
}
}
// chainHasPattern reports whether the handler chain holds an entry registered
// for the given fqdn pattern at the given priority.
func chainHasPattern(s *DefaultServer, pattern string, priority int) bool {
for _, h := range s.handlerChain.handlers {
if h.OrigPattern == pattern && h.Priority == priority {
return true
}
}
return false
}
// TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval verifies that updateMux
// tracks each (handler, domain) registration independently when one handler
// serves multiple zones. Every custom zone is served by the same handler
// instance (the local resolver, whose ID is the constant "local-resolver"), so
// removing one zone must deregister exactly that zone's chain entry and leave
// the others in place. Tracking registrations by handler ID alone collapses all
// zones onto one entry, leaving removed zones in the chain to answer
// authoritatively with no records.
func TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval(t *testing.T) {
// One handler serves every custom zone, mirroring s.localResolver.
shared := &mockHandler{Id: "local-resolver"}
server := &DefaultServer{
handlerChain: NewHandlerChain(),
service: &mockService{},
}
// Two custom zones under the same handler. The surviving zone is registered
// last, mirroring the management emission order.
server.updateMux([]handlerWrapper{
{domain: "userzone.test", handler: shared, priority: PriorityLocal},
{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
})
require.True(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
"userzone.test should be registered after the first update")
require.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
"peerzone.test should be registered after the first update")
// Remove one zone, keep the other.
server.updateMux([]handlerWrapper{
{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
})
assert.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
"peerzone.test should remain after removing userzone.test")
assert.False(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
"userzone.test handler must be deregistered, not leaked in the chain")
}
// TestDefaultServer_UpdateMux_PreservesLocalResolver verifies that updateMux
// does not tear down the shared local resolver during reconfiguration. The
// resolver is a process-lifetime singleton reused across config updates;
// Stop() cancels its lookup context (breaking external CNAME-target
// resolution) and clears its records. updateMux must deregister its chain
// entries without stopping it. Records surviving a teardown update is the
// observable proxy: Stop() would have cleared them.
func TestDefaultServer_UpdateMux_PreservesLocalResolver(t *testing.T) {
resolver := local.NewResolver()
require.NoError(t, resolver.RegisterRecord(nbdns.SimpleRecord{
Name: "peer.netbird.cloud.",
Type: int(dns.TypeA),
Class: nbdns.DefaultClass,
TTL: 300,
RData: "10.0.0.1",
}))
server := &DefaultServer{
handlerChain: NewHandlerChain(),
service: &mockService{},
localResolver: resolver,
}
server.updateMux([]handlerWrapper{
{domain: "netbird.cloud", handler: resolver, priority: PriorityLocal},
})
// Remove the zone. The resolver must survive so its records and lookup
// context stay intact for the next registration.
server.updateMux(nil)
var response *dns.Msg
resolver.ServeDNS(&test.MockResponseWriter{
WriteMsgFunc: func(m *dns.Msg) error {
response = m
return nil
},
}, &dns.Msg{Question: []dns.Question{{Name: "peer.netbird.cloud.", Qtype: dns.TypeA, Qclass: dns.ClassINET}}})
require.NotNil(t, response, "local resolver should answer after teardown")
assert.Equal(t, dns.RcodeSuccess, response.Rcode,
"local resolver records must survive teardown; updateMux must not Stop() the shared resolver")
assert.NotEmpty(t, response.Answer, "answer should contain the surviving record")
}
func TestExtraDomains(t *testing.T) {
tests := []struct {
name string
@@ -2136,6 +2049,7 @@ func TestBuildUpstreamHandler_MergesGroupsPerDomain(t *testing.T) {
localResolver: local.NewResolver(),
handlerChain: NewHandlerChain(),
hostManager: &noopHostConfigurator{},
dnsMuxMap: make(registeredHandlerMap),
}
groups := []*nbdns.NameServerGroup{
@@ -2293,7 +2207,7 @@ func TestEvaluateNSGroupHealth(t *testing.T) {
}
}
// healthStubHandler is a minimal dnsMuxHandlers entry that exposes a fixed
// healthStubHandler is a minimal dnsMuxMap entry that exposes a fixed
// UpstreamHealth snapshot, letting tests drive recomputeNSGroupStates
// without spinning up real handlers.
type healthStubHandler struct {
@@ -2369,11 +2283,12 @@ func newProjTestFixture(t *testing.T) *projTestFixture {
ctx: context.Background(),
wgInterface: &mocWGIface{},
statusRecorder: recorder,
dnsMuxMap: make(registeredHandlerMap),
selectedRoutes: func() route.HAMap { return fx.selected },
activeRoutes: func() route.HAMap { return fx.active },
warningDelayBase: defaultWarningDelayBase,
}
fx.server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}}
fx.server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}
fx.server.mux.Lock()
fx.server.updateNSGroupStates([]*nbdns.NameServerGroup{fx.group})
@@ -2480,6 +2395,7 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
ctx: context.Background(),
wgInterface: &mocWGIface{},
statusRecorder: recorder,
dnsMuxMap: make(registeredHandlerMap),
selectedRoutes: func() route.HAMap { return nil },
activeRoutes: func() route.HAMap { return nil },
warningDelayBase: 50 * time.Millisecond,
@@ -2491,7 +2407,7 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{
overlayPeer: {LastFail: time.Now(), LastErr: "timeout"},
}}
server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
server.mux.Lock()
server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2528,6 +2444,7 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
service: NewServiceViaMemory(wgIface),
hostManager: &noopHostConfigurator{},
extraDomains: map[domain.Domain]int{},
dnsMuxMap: make(registeredHandlerMap),
statusRecorder: peer.NewRecorder("mgm"),
selectedRoutes: func() route.HAMap { return nil },
activeRoutes: func() route.HAMap { return nil },
@@ -2542,7 +2459,7 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
NameServers: []nbdns.NameServer{{IP: srv.Addr(), NSType: nbdns.UDPNameServerType, Port: int(srv.Port())}},
}
stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{srv: {LastOk: time.Now()}}}
server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
server.mux.Lock()
server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2567,32 +2484,6 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
// rule 3: startup failures while the peer is handshaking, then the peer
// comes up and a query succeeds before the grace window elapses. No
// warning should ever have fired, and no recovery either.
func TestWarningDelayBaseFromEnv(t *testing.T) {
tests := []struct {
name string
set bool
val string
want time.Duration
}{
{name: "unset uses default", set: false, want: defaultWarningDelayBase},
{name: "valid override", set: true, val: "90s", want: 90 * time.Second},
{name: "valid minutes", set: true, val: "2m", want: 2 * time.Minute},
{name: "invalid falls back", set: true, val: "notaduration", want: defaultWarningDelayBase},
{name: "zero falls back", set: true, val: "0s", want: defaultWarningDelayBase},
{name: "negative falls back", set: true, val: "-30s", want: defaultWarningDelayBase},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv(envWarningDelay, tc.val)
if !tc.set {
os.Unsetenv(envWarningDelay)
}
assert.Equal(t, tc.want, warningDelayBaseFromEnv(), "grace window base")
})
}
}
func TestProjection_OverlayRecoversDuringGrace(t *testing.T) {
fx := newProjTestFixture(t)
fx.server.warningDelayBase = 200 * time.Millisecond
@@ -2704,6 +2595,7 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
server := &DefaultServer{
ctx: context.Background(),
statusRecorder: recorder,
dnsMuxMap: make(registeredHandlerMap),
selectedRoutes: func() route.HAMap { return overlayMap },
activeRoutes: func() route.HAMap { return nil },
warningDelayBase: time.Hour,
@@ -2721,7 +2613,7 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
overlay: {LastFail: time.Now(), LastErr: "timeout"},
},
}
server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
server.mux.Lock()
server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2748,6 +2640,7 @@ func TestDNSLoopPrevention(t *testing.T) {
localResolver: local.NewResolver(),
handlerChain: NewHandlerChain(),
hostManager: &noopHostConfigurator{},
dnsMuxMap: make(registeredHandlerMap),
}
tests := []struct {

View File

@@ -443,32 +443,29 @@ func (u *upstreamResolverBase) queryUpstream(parentCtx context.Context, r *dns.M
return raceResult{}, &upstreamFailure{upstream: upstream, reason: "no response"}
}
// A valid response means the upstream is reachable, whatever the Rcode.
u.markUpstreamOk(upstream)
proto := ""
if upstreamProto != nil {
proto = upstreamProto.protocol
}
if rm.Rcode == dns.RcodeServerFailure || rm.Rcode == dns.RcodeRefused {
// SERVFAIL and REFUSED are per-question outcomes (DNSSEC-bogus names,
// refused zones, transient recursion errors), not reachability
// problems: fail over for a better answer but keep the upstream healthy.
if code, ok := nonRetryableEDE(rm); ok {
if !hadEdns {
resutil.StripOPT(rm)
stripOPT(rm)
}
u.markUpstreamOk(upstream)
return raceResult{msg: rm, upstream: upstream, protocol: proto, ede: edeName(code)}, nil
}
reason := dns.RcodeToString[rm.Rcode]
u.markUpstreamFail(upstream, reason)
return raceResult{}, &upstreamFailure{upstream: upstream, reason: reason}
}
if !hadEdns {
resutil.StripOPT(rm)
stripOPT(rm)
}
u.markUpstreamOk(upstream)
return raceResult{msg: rm, upstream: upstream, protocol: proto}, nil
}
@@ -523,6 +520,22 @@ func upstreamUDPSize() uint16 {
return dns.MinMsgSize
}
// stripOPT removes any OPT pseudo-RRs from the response's Extra section so
// the response complies with RFC 6891 when the client did not advertise EDNS0.
func stripOPT(rm *dns.Msg) {
if len(rm.Extra) == 0 {
return
}
out := rm.Extra[:0]
for _, rr := range rm.Extra {
if _, ok := rr.(*dns.OPT); ok {
continue
}
out = append(out, rr)
}
rm.Extra = out
}
func (u *upstreamResolverBase) handleUpstreamError(err error, upstream netip.AddrPort, startTime time.Time) *upstreamFailure {
if !errors.Is(err, context.DeadlineExceeded) && !isTimeout(err) {
return &upstreamFailure{upstream: upstream, reason: err.Error()}

View File

@@ -517,78 +517,6 @@ func TestUpstreamResolver_HealthTracking(t *testing.T) {
assert.NotContains(t, health, bad, "sibling upstream should not be queried when primary answers")
}
// TestUpstreamResolver_HealthTracking_ResponseMeansReachable verifies that an
// upstream which answers with SERVFAIL or REFUSED is recorded as healthy:
// those are per-question outcomes from a reachable server and must not mark
// the upstream unhealthy. Only transport failures (timeouts) do.
func TestUpstreamResolver_HealthTracking_ResponseMeansReachable(t *testing.T) {
a := netip.MustParseAddrPort("192.0.2.10:53")
b := netip.MustParseAddrPort("192.0.2.11:53")
timeoutErr := &net.OpError{Op: "read", Err: fmt.Errorf("i/o timeout")}
tests := []struct {
name string
respA mockUpstreamResponse
respB mockUpstreamResponse
wantHealthy bool
}{
{
name: "both SERVFAIL are reachable",
respA: mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
respB: mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
wantHealthy: true,
},
{
name: "both REFUSED are reachable",
respA: mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
respB: mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
wantHealthy: true,
},
{
name: "timeout marks unhealthy",
respA: mockUpstreamResponse{err: timeoutErr},
respB: mockUpstreamResponse{err: timeoutErr},
wantHealthy: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
mockClient := &mockUpstreamResolverPerServer{
responses: map[string]mockUpstreamResponse{
a.String(): tc.respA,
b.String(): tc.respB,
},
rtt: time.Millisecond,
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
resolver := &upstreamResolverBase{
ctx: ctx,
upstreamClient: mockClient,
upstreamTimeout: UpstreamTimeout,
}
resolver.addRace([]netip.AddrPort{a, b})
responseWriter := &test.MockResponseWriter{WriteMsgFunc: func(m *dns.Msg) error { return nil }}
resolver.ServeDNS(responseWriter, new(dns.Msg).SetQuestion("example.com.", dns.TypeA))
health := resolver.UpstreamHealth()
require.Contains(t, health, a, "primary upstream should have a health record")
if tc.wantHealthy {
assert.False(t, health[a].LastOk.IsZero(), "responding upstream should have LastOk set")
assert.True(t, health[a].LastFail.IsZero(), "responding upstream should not be marked failed")
assert.Empty(t, health[a].LastErr, "responding upstream should have no error")
} else {
assert.False(t, health[a].LastFail.IsZero(), "timed-out upstream should be marked failed")
assert.NotEmpty(t, health[a].LastErr, "timed-out upstream should record an error")
}
})
}
}
func TestFormatFailures(t *testing.T) {
testCases := []struct {
name string
@@ -985,6 +913,19 @@ func TestEDEName(t *testing.T) {
assert.Equal(t, "EDE 9999", edeName(9999), "unknown code falls back to numeric")
}
func TestStripOPT(t *testing.T) {
rm := &dns.Msg{
Extra: []dns.RR{
&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
},
}
stripOPT(rm)
assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
_, isOPT := rm.Extra[0].(*dns.OPT)
assert.False(t, isOPT, "remaining record must not be OPT")
}
func TestUpstreamResolver_NonRetryableEDEShortCircuits(t *testing.T) {
upstream1 := netip.MustParseAddrPort("192.0.2.1:53")
upstream2 := netip.MustParseAddrPort("192.0.2.2:53")

View File

@@ -26,15 +26,6 @@ import (
const errResolveFailed = "failed to resolve query for domain=%s: %v"
const upstreamTimeout = 15 * time.Second
// EDE info codes the forwarder emits on upstream failures so the querying
// client can see the reason without inspecting this peer's logs. They live in
// the RFC 8914 Private Use range (49152-65535); the Go resolver never exposes a
// real upstream EDE here, so these cannot collide with a genuine code.
const (
edeNetbirdUpstreamTimeout uint16 = 49152
edeNetbirdUpstreamFailure uint16 = 49153
)
type resolver interface {
LookupNetIP(ctx context.Context, network, host string) ([]netip.Addr, error)
}
@@ -229,7 +220,7 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q
result := resutil.LookupIP(ctx, f.resolver, network, qname, question.Qtype)
if result.Err != nil {
f.handleDNSError(ctx, logger, w, question, resp, qname, result, query.IsEdns0() != nil, startTime)
f.handleDNSError(ctx, logger, w, question, resp, qname, result, startTime)
return
}
@@ -342,7 +333,6 @@ func (f *DNSForwarder) handleDNSError(
resp *dns.Msg,
domain string,
result resutil.LookupResult,
reqHasEdns bool,
startTime time.Time,
) {
qType := question.Qtype
@@ -384,10 +374,6 @@ func (f *DNSForwarder) handleDNSError(
logger.Warnf(errResolveFailed, domain, result.Err)
}
if reqHasEdns {
attachEDE(resp, edeCodeFor(dnsErr), edeText(dnsErr))
}
f.writeResponse(logger, w, resp, domain, startTime)
}
@@ -428,33 +414,3 @@ func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*Forwar
return selectedResId, matches
}
// edeCodeFor maps an upstream lookup error to the NetBird EDE info code.
func edeCodeFor(dnsErr *net.DNSError) uint16 {
if dnsErr != nil && dnsErr.IsTimeout {
return edeNetbirdUpstreamTimeout
}
return edeNetbirdUpstreamFailure
}
// edeText builds the EDE extra-text describing the class of upstream failure.
// It deliberately omits the upstream server address, which may be an internal
// resolver and is exposed to any client permitted to use the route; the full
// detail stays in the forwarder's local log.
func edeText(dnsErr *net.DNSError) string {
if dnsErr != nil && dnsErr.IsTimeout {
return "netbird forwarder: upstream timeout"
}
return "netbird forwarder: upstream failure"
}
// attachEDE adds an Extended DNS Error (RFC 8914) option to the response,
// creating the OPT pseudo-record if the response does not already carry one.
func attachEDE(resp *dns.Msg, code uint16, text string) {
opt := resp.IsEdns0()
if opt == nil {
resp.SetEdns0(dns.DefaultMsgSize, false)
opt = resp.IsEdns0()
}
opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: code, ExtraText: text})
}

View File

@@ -16,7 +16,6 @@ import (
"github.com/stretchr/testify/require"
firewall "github.com/netbirdio/netbird/client/firewall/manager"
"github.com/netbirdio/netbird/client/internal/dns/resutil"
"github.com/netbirdio/netbird/client/internal/dns/test"
"github.com/netbirdio/netbird/client/internal/peer"
"github.com/netbirdio/netbird/route"
@@ -618,85 +617,6 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
}
}
func TestDNSForwarder_UpstreamFailureEDE(t *testing.T) {
tests := []struct {
name string
lookupErr error
reqEdns bool
wantEDE bool
wantCode uint16
wantTextHas string
}{
{
name: "timeout with edns0",
lookupErr: &net.DNSError{Err: "i/o timeout", Server: "10.0.0.53:53", IsTimeout: true},
reqEdns: true,
wantEDE: true,
wantCode: edeNetbirdUpstreamTimeout,
wantTextHas: "netbird forwarder: upstream timeout",
},
{
name: "server failure with edns0",
lookupErr: &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
reqEdns: true,
wantEDE: true,
wantCode: edeNetbirdUpstreamFailure,
wantTextHas: "netbird forwarder: upstream failure",
},
{
name: "no edns0 in request omits ede",
lookupErr: &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
reqEdns: false,
wantEDE: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mockResolver := &MockResolver{}
forwarder := NewDNSForwarder(netip.MustParseAddrPort("127.0.0.1:0"), 300, nil, &peer.Status{}, nil)
forwarder.resolver = mockResolver
d, err := domain.FromString("example.com")
require.NoError(t, err)
forwarder.UpdateDomains([]*ForwarderEntry{{Domain: d, ResID: "test-res"}})
mockResolver.On("LookupNetIP", mock.Anything, "ip4", "example.com.").
Return([]netip.Addr(nil), tt.lookupErr).Once()
query := &dns.Msg{}
query.SetQuestion("example.com.", dns.TypeA)
if tt.reqEdns {
query.SetEdns0(dns.DefaultMsgSize, false)
}
var writtenResp *dns.Msg
mockWriter := &test.MockResponseWriter{
WriteMsgFunc: func(m *dns.Msg) error {
writtenResp = m
return nil
},
}
forwarder.handleDNSQuery(log.NewEntry(log.StandardLogger()), mockWriter, query, time.Now())
mockResolver.AssertExpectations(t)
require.NotNil(t, writtenResp, "expected a response")
assert.Equal(t, dns.RcodeServerFailure, writtenResp.Rcode, "upstream failure must be SERVFAIL")
ede, ok := resutil.ExtractEDE(writtenResp)
if !tt.wantEDE {
assert.False(t, ok, "response must not carry EDE")
return
}
require.True(t, ok, "response must carry EDE")
assert.Equal(t, tt.wantCode, ede.InfoCode, "EDE info code")
assert.Contains(t, ede.ExtraText, tt.wantTextHas, "EDE extra-text")
assert.NotContains(t, ede.ExtraText, "10.0.0.53", "must not leak upstream server address")
})
}
}
func TestDNSForwarder_TCPTruncation(t *testing.T) {
// Test that large UDP responses are truncated with TC bit set
mockResolver := &MockResolver{}

View File

@@ -86,8 +86,6 @@ const (
var ErrResetConnection = fmt.Errorf("reset connection")
var ErrEngineAlreadyStarted = errors.New("engine already started")
type EngineConfig struct {
WgPort int
WgIfaceName string
@@ -201,8 +199,6 @@ type Engine struct {
ctx context.Context
cancel context.CancelFunc
started bool
wgInterface WGIface
udpMux *udpmux.UniversalUDPMuxDefault
@@ -283,15 +279,9 @@ func NewEngine(
services EngineServices,
mobileDep MobileDependency,
) *Engine {
// The engine is single-use: a fresh instance is built per connection
// cycle (see Client.run), so the run context is created once here rather
// than in Start.
ctx, cancel := context.WithCancel(clientCtx)
engine := &Engine{
clientCtx: clientCtx,
clientCancel: clientCancel,
ctx: ctx,
cancel: cancel,
signal: services.SignalClient,
signaler: peer.NewSignaler(services.SignalClient, config.WgPrivateKey),
mgmClient: services.MgmClient,
@@ -324,34 +314,8 @@ func (e *Engine) Stop() error {
log.Debugf("tried stopping engine that is nil")
return nil
}
e.cancel()
e.syncMsgMux.Lock()
e.stopLocked()
e.syncMsgMux.Unlock()
timeout := e.calculateShutdownTimeout()
log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
}
log.Infof("stopped Netbird Engine")
return nil
}
// stopLocked tears down everything Start may have brought up, in the order
// teardown requires (DNS before the interface goes down, flow manager after).
// The caller must hold syncMsgMux. It is shared by Stop and by Start's failure
// path, so a partially-initialized engine is cleaned up the same way; every
// step is nil-guarded. It does not wait on shutdownWg — the caller does that
// after releasing the lock, since the goroutines also take syncMsgMux.
func (e *Engine) stopLocked() {
if e.connMgr != nil {
e.connMgr.Close()
}
@@ -402,6 +366,10 @@ func (e *Engine) stopLocked() {
// so dbus and friends don't complain because of a missing interface
e.stopDNSServer()
if e.cancel != nil {
e.cancel()
}
e.jobExecutorWG.Wait() // block until job goroutines finish
e.close()
@@ -420,6 +388,21 @@ func (e *Engine) stopLocked() {
if err := e.stateManager.PersistState(context.Background()); err != nil {
log.Errorf("failed to persist state: %v", err)
}
e.syncMsgMux.Unlock()
timeout := e.calculateShutdownTimeout()
log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
}
log.Infof("stopped Netbird Engine")
return nil
}
// calculateShutdownTimeout returns shutdown timeout: 10s base + 100ms per peer, capped at 30s.
@@ -457,38 +440,18 @@ func waitWithContext(ctx context.Context, wg *sync.WaitGroup) error {
// Start creates a new WireGuard tunnel interface and listens to events from Signal and Management services
// Connections to remote peers are not established here.
// However, they will be established once an event with a list of peers to connect to will be received from Management Service
func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) (err error) {
func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) error {
e.syncMsgMux.Lock()
defer e.syncMsgMux.Unlock()
// The engine is single-use. Reject a duplicate start and a start on an
// already-stopped engine (run context cancelled).
if e.started {
return ErrEngineAlreadyStarted
}
if ctxErr := e.ctx.Err(); ctxErr != nil {
return fmt.Errorf("engine already stopped: %w", ctxErr)
}
e.started = true
// Tear down any partially-initialized state on a failed start. Cancel the
// run context first so goroutines started before the failure (connMgr,
// srWatcher, monitors) unwind, then stopLocked mirrors Stop's teardown (we
// already hold syncMsgMux), cleaning up route/DNS/flow/state managers too,
// not just what close() covers.
defer func() {
if err != nil {
e.cancel()
e.stopLocked()
}
}()
if err = iface.ValidateMTU(e.config.MTU); err != nil {
if err := iface.ValidateMTU(e.config.MTU); err != nil {
return fmt.Errorf("invalid MTU configuration: %w", err)
}
if e.cancel != nil {
e.cancel()
}
e.ctx, e.cancel = context.WithCancel(e.clientCtx)
e.exposeManager = expose.NewManager(e.ctx, e.mgmClient)
wgIface, err := e.newWgIface()
@@ -522,11 +485,13 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
initialRoutes, dnsConfig, dnsFeatureFlag, err := e.readInitialSettings()
if err != nil {
e.close()
return fmt.Errorf("read initial settings: %w", err)
}
dnsServer, err := e.newDnsServer(dnsConfig)
if err != nil {
e.close()
return fmt.Errorf("create dns server: %w", err)
}
e.dnsServer = dnsServer
@@ -561,6 +526,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
if err = e.wgInterfaceCreate(); err != nil {
log.Errorf("failed creating tunnel interface %s: [%s]", e.config.WgIfaceName, err.Error())
e.close()
return fmt.Errorf("create wg interface: %w", err)
}
@@ -569,6 +535,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
}
if err := e.createFirewall(); err != nil {
e.close()
return err
}
@@ -580,6 +547,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
e.udpMux, err = e.wgInterface.Up()
if err != nil {
log.Errorf("failed to pull up wgInterface [%s]: %s", e.wgInterface.Name(), err.Error())
e.close()
return fmt.Errorf("up wg interface: %w", err)
}
@@ -604,7 +572,9 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
e.acl = acl.NewDefaultManager(e.firewall)
}
if err := e.dnsServer.Initialize(); err != nil {
err = e.dnsServer.Initialize()
if err != nil {
e.close()
return fmt.Errorf("initialize dns server: %w", err)
}
@@ -616,9 +586,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
e.srWatcher = guard.NewSRWatcher(e.signal, e.relayManager, e.mobileDep.IFaceDiscover, iceCfg)
e.srWatcher.Start(peer.IsForceRelayed())
if err = e.receiveSignalEvents(); err != nil {
return err
}
e.receiveSignalEvents()
e.receiveManagementEvents()
e.receiveJobEvents()
@@ -670,6 +638,7 @@ func (e *Engine) createFirewall() error {
func (e *Engine) initFirewall() error {
if err := e.routeManager.SetFirewall(e.firewall); err != nil {
e.close()
return fmt.Errorf("set firewall: %w", err)
}
@@ -1729,7 +1698,7 @@ func (e *Engine) createPeerConn(pubKey string, allowedIPs []netip.Prefix, agentV
}
// receiveSignalEvents connects to the Signal Service event stream to negotiate connection with remote peers
func (e *Engine) receiveSignalEvents() error {
func (e *Engine) receiveSignalEvents() {
e.shutdownWg.Add(1)
go func() {
defer e.shutdownWg.Done()
@@ -1745,13 +1714,6 @@ func (e *Engine) receiveSignalEvents() error {
return e.ctx.Err()
}
// Self-addressed heartbeat: the signal client's receive watchdog
// round-trips this through the server to confirm the receive stream
// is delivering. Liveness is already recorded before this handler.
if msg.GetBody().GetType() == sProto.Body_HEARTBEAT {
return nil
}
conn, ok := e.peerStore.PeerConn(msg.Key)
if !ok {
return fmt.Errorf("wrongly addressed message %s", msg.Key)
@@ -1800,12 +1762,7 @@ func (e *Engine) receiveSignalEvents() error {
}
}()
// todo: consider to remove this blocker. I do not see benefit to block the Start operations
e.signal.WaitStreamConnected(e.ctx)
if err := e.ctx.Err(); err != nil {
return fmt.Errorf("wait for signal stream: %w", err)
}
return nil
e.signal.WaitStreamConnected()
}
func (e *Engine) parseNATExternalIPMappings() []string {

View File

@@ -247,7 +247,7 @@ func TestEngine_SSH(t *testing.T) {
return
}
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
@@ -426,7 +426,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
return
}
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
@@ -638,7 +638,7 @@ func TestEngine_Sync(t *testing.T) {
return
}
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// feed updates to Engine via mocked Management client
@@ -817,7 +817,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
return
}
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
wgIfaceName := fmt.Sprintf("utun%d", 104+n)
@@ -1024,7 +1024,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
return
}
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
wgIfaceName := fmt.Sprintf("utun%d", 104+n)

View File

@@ -1024,17 +1024,14 @@ func (d *Status) GetRelayStates() []relay.ProbeResult {
return d.relayStates
}
// extend the list of stun, turn servers with the relay server connections
// extend the list of stun, turn servers with relay address
relayStates := slices.Clone(d.relayStates)
states := d.relayMgr.RelayStates()
if len(states) == 0 {
// no relay connection tracked yet; surface configured servers as
// unavailable with the real reconnect error when known
err := relayClient.ErrRelayClientNotConnected
if connErr := d.relayMgr.RelayConnectError(); connErr != nil {
err = connErr
}
// if the server connection is not established then we will use the general address
// in case of connection we will use the instance specific address
instanceAddr, _, err := d.relayMgr.RelayInstanceAddress()
if err != nil {
// TODO add their status
for _, r := range d.relayMgr.ServerURLs() {
relayStates = append(relayStates, relay.ProbeResult{
URI: r,
@@ -1044,14 +1041,10 @@ func (d *Status) GetRelayStates() []relay.ProbeResult {
return relayStates
}
for _, rs := range states {
relayStates = append(relayStates, relay.ProbeResult{
URI: rs.URL,
Err: rs.Err,
Transport: rs.Transport,
})
relayState := relay.ProbeResult{
URI: instanceAddr,
}
return relayStates
return append(relayStates, relayState)
}
func (d *Status) ForwardingRules() []firewall.ForwardRule {
@@ -1412,7 +1405,6 @@ func (fs FullStatus) ToProto() *proto.FullStatus {
pbRelayState := &proto.RelayState{
URI: relayState.URI,
Available: relayState.Err == nil,
Transport: relayState.Transport,
}
if err := relayState.Err; err != nil {
pbRelayState.Error = err.Error()

View File

@@ -108,10 +108,6 @@ type ConfigInput struct {
// Config Configuration type
type Config struct {
// Name is the human-readable profile name shown in CLI/UI listings.
// It is independent of the profile's on-disk filename (which is the ID).
Name string
// Wireguard private key of local peer
PrivateKey string
PreSharedKey string
@@ -274,16 +270,6 @@ func createNewConfig(input ConfigInput) (*Config, error) {
}
func (config *Config) apply(input ConfigInput) (updated bool, err error) {
if config.Name != "" {
sanitized, err := sanitizeDisplayName(config.Name)
if err != nil {
return false, fmt.Errorf("invalid profile name: %w", err)
}
if sanitized != config.Name {
config.Name = sanitized
updated = true
}
}
if config.ManagementURL == nil {
log.Infof("using default Management URL %s", DefaultManagementURL)
config.ManagementURL, err = parseURL("Management URL", DefaultManagementURL)

View File

@@ -1,118 +0,0 @@
package profilemanager
import (
"crypto/rand"
"encoding/hex"
"fmt"
"path/filepath"
"strings"
"unicode"
"unicode/utf8"
)
const (
// profileIDByteLen is the number of random bytes generated for a new
// profile ID. The resulting hex string is twice this length.
profileIDByteLen = 16
// shortIDLen is the number of leading characters of an ID we render in
// list output. Profiles per device are few, so 8 chars is collision-safe
// in practice and easy to type as a prefix.
shortIDLen = 8
// maxProfileNameLen caps the human-readable profile name to keep table
// output legible and prevent denial-of-service via huge JSON fields.
maxProfileNameLen = 128
// maxProfileIDLen bounds the on-disk filename we'll accept. New
// IDs are 32 hex chars, legacy stems are sanitized profile names. The
// cap is generous enough to cover both without permitting absurdly
// long filenames.
maxProfileIDLen = 64
)
type ID string
// generateProfileID returns a new random hex ID for a profile file.
func generateProfileID() (ID, error) {
buf := make([]byte, profileIDByteLen)
if _, err := rand.Read(buf); err != nil {
return "", fmt.Errorf("read random bytes: %w", err)
}
return ID(hex.EncodeToString(buf)), nil
}
// IsValidProfileFilenameStem reports whether id is safe to use as the stem
// of a profile JSON filename.
func IsValidProfileFilenameStem(id ID) bool {
s := id.String()
if s == "" || len(s) > maxProfileIDLen {
return false
}
if s == defaultProfileName {
return true
}
if strings.ContainsAny(s, `/\`) || strings.Contains(s, "..") {
return false
}
// filepath.Base catches any leftover separators on platforms with
// exotic path conventions.
if filepath.Base(s) != s {
return false
}
for _, r := range s {
if !(unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '-') {
return false
}
}
return true
}
// sanitizeDisplayName normalizes a user-supplied profile display name for
// storage. It strips ASCII control characters, rejects invalid UTF-8, and
// caps the length. Emojis, spaces, punctuation, and non-ASCII letters are
// preserved. Returns an error if nothing usable remains.
func sanitizeDisplayName(name string) (string, error) {
if !utf8.ValidString(name) {
return "", fmt.Errorf("name is not valid UTF-8")
}
name = StripCtrlChars(name)
name = strings.TrimSpace(name)
if name == "" {
return "", fmt.Errorf("name is empty after sanitization")
}
if utf8.RuneCountInString(name) > maxProfileNameLen {
return "", fmt.Errorf("name exceeds %d characters", maxProfileNameLen)
}
return name, nil
}
// StripCtrlChars control characters from a name before printing it.
func StripCtrlChars(name string) string {
var b strings.Builder
b.Grow(len(name))
for _, r := range name {
// Skip C0 controls and DEL, plus C1 controls (0x800x9F).
if r < 0x20 || r == 0x7F || (r >= 0x80 && r <= 0x9F) {
continue
}
b.WriteRune(r)
}
return b.String()
}
// ShortID truncates an ID for display.
func (id ID) ShortID() string {
if id == DefaultProfileName {
return DefaultProfileName
}
runes := []rune(id)
if len(runes) <= shortIDLen {
return id.String()
}
return string(runes[:shortIDLen])
}
func (id ID) String() string {
return string(id)
}

View File

@@ -19,41 +19,19 @@ const (
)
type Profile struct {
// ID is the on-disk filename stem (without .json). For new profiles
// it is a 32-char hex string; legacy profiles created before the
// ID-keyed layout keep their original name as their ID. The reserved
// value "default" identifies the special default profile.
ID ID
// Name is the human-readable display name. Falls back to ID when the
// underlying JSON has no "name" field set.
Name string
// Path is the absolute path to the profile JSON. Populated by the
// loader so callers do not have to reconstruct it from ID + dir.
Path string
Name string
IsActive bool
}
func (p *Profile) FilePath() (string, error) {
if p.Path != "" {
return p.Path, nil
if p.Name == "" {
return "", fmt.Errorf("active profile name is empty")
}
id := p.ID
if id == "" {
id = ID(p.Name)
}
if id == "" {
return "", fmt.Errorf("profile ID is empty")
}
if id == defaultProfileName {
if p.Name == defaultProfileName {
return DefaultConfigPath, nil
}
if !IsValidProfileFilenameStem(id) {
return "", fmt.Errorf("invalid profile ID: %q", id)
}
username, err := user.Current()
if err != nil {
return "", fmt.Errorf("failed to get current user: %w", err)
@@ -64,13 +42,10 @@ func (p *Profile) FilePath() (string, error) {
return "", fmt.Errorf("failed to get config directory for user %s: %w", username.Username, err)
}
return filepath.Join(configDir, id.String()+".json"), nil
return filepath.Join(configDir, p.Name+".json"), nil
}
func (p *Profile) IsDefault() bool {
if p.ID != "" {
return p.ID == defaultProfileName
}
return p.Name == defaultProfileName
}
@@ -82,24 +57,18 @@ func NewProfileManager() *ProfileManager {
return &ProfileManager{}
}
// GetActiveProfile returns the active profile as recorded in the local
// user state file. Only ID is populated.
func (pm *ProfileManager) GetActiveProfile() (*Profile, error) {
pm.mu.Lock()
defer pm.mu.Unlock()
id := pm.getActiveProfileState()
return &Profile{ID: id}, nil
prof := pm.getActiveProfileState()
return &Profile{Name: prof}, nil
}
// SwitchProfile records the given profile ID as active in the local user
// state file.
func (pm *ProfileManager) SwitchProfile(id ID) error {
if id != defaultProfileName && !IsValidProfileFilenameStem(id) {
return fmt.Errorf("invalid profile ID: %q", id)
}
func (pm *ProfileManager) SwitchProfile(profileName string) error {
profileName = sanitizeProfileName(profileName)
if err := pm.setActiveProfileState(id); err != nil {
if err := pm.setActiveProfileState(profileName); err != nil {
return fmt.Errorf("failed to switch profile: %w", err)
}
return nil
@@ -116,7 +85,7 @@ func sanitizeProfileName(name string) string {
}, name)
}
func (pm *ProfileManager) getActiveProfileState() ID {
func (pm *ProfileManager) getActiveProfileState() string {
configDir, err := getConfigDir()
if err != nil {
@@ -144,10 +113,10 @@ func (pm *ProfileManager) getActiveProfileState() ID {
return defaultProfileName
}
return ID(profileName)
return profileName
}
func (pm *ProfileManager) setActiveProfileState(id ID) error {
func (pm *ProfileManager) setActiveProfileState(profileName string) error {
configDir, err := getConfigDir()
if err != nil {
@@ -156,7 +125,7 @@ func (pm *ProfileManager) setActiveProfileState(id ID) error {
statePath := filepath.Join(configDir, activeProfileStateFilename)
err = os.WriteFile(statePath, []byte(id), 0600)
err = os.WriteFile(statePath, []byte(profileName), 0600)
if err != nil {
return fmt.Errorf("failed to write active profile state: %w", err)
}
@@ -173,7 +142,7 @@ func GetLoginHint() string {
return ""
}
profileState, err := pm.GetProfileState(activeProf.ID)
profileState, err := pm.GetProfileState(activeProf.Name)
if err != nil {
log.Debugf("failed to get profile state for login hint: %v", err)
return ""

View File

@@ -50,14 +50,14 @@ func TestServiceManager_CreateAndGetDefaultProfile(t *testing.T) {
state, err := sm.GetActiveProfileState()
assert.NoError(t, err)
assert.Equal(t, defaultProfileName, state.ID.String()) // No active profile state yet
assert.Equal(t, state.Name, defaultProfileName) // No active profile state yet
err = sm.SetActiveProfileStateToDefault()
assert.NoError(t, err)
active, err := sm.GetActiveProfileState()
assert.NoError(t, err)
assert.Equal(t, "default", active.ID.String())
assert.Equal(t, "default", active.Name)
})
})
}
@@ -92,14 +92,14 @@ func TestServiceManager_SetActiveProfileState(t *testing.T) {
currUser, err := user.Current()
assert.NoError(t, err)
sm := &ServiceManager{}
state := &ActiveProfileState{ID: "foo", Username: currUser.Username}
state := &ActiveProfileState{Name: "foo", Username: currUser.Username}
err = sm.SetActiveProfileState(state)
assert.NoError(t, err)
// Should error on nil or incomplete state
err = sm.SetActiveProfileState(nil)
assert.Error(t, err)
err = sm.SetActiveProfileState(&ActiveProfileState{ID: "", Username: ""})
err = sm.SetActiveProfileState(&ActiveProfileState{Name: "", Username: ""})
assert.Error(t, err)
})
})

View File

@@ -2,7 +2,6 @@ package profilemanager
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
@@ -24,43 +23,12 @@ var (
DefaultConfigPathDir = ""
DefaultConfigPath = ""
ActiveProfileStatePath = ""
)
var (
ErrorOldDefaultConfigNotFound = errors.New("old default config not found")
)
// ErrAmbiguousHandle is returned when a profile handle (ID prefix or name)
// matches more than one profile. Callers can render Candidates to help the
// user disambiguate.
type ErrAmbiguousHandle struct {
Handle string
Candidates []Profile
Kind AmbiguityKind
}
// AmbiguityKind describes which matcher produced the ambiguity, so callers
// can tailor the error message.
type AmbiguityKind int
const (
AmbiguityKindIDPrefix AmbiguityKind = iota
AmbiguityKindName
)
// profileMeta is the minimal slice of a profile JSON we need, so we avoid
// reading all fields
type profileMeta struct {
Name string
}
func (e *ErrAmbiguousHandle) Error() string {
switch e.Kind {
case AmbiguityKindIDPrefix:
return fmt.Sprintf("ID prefix %q is ambiguous (matches %d profiles)", e.Handle, len(e.Candidates))
default:
return fmt.Sprintf("name %q is ambiguous (%d profiles share this name)", e.Handle, len(e.Candidates))
}
}
func init() {
DefaultConfigPathDir = "/var/lib/netbird/"
@@ -86,34 +54,25 @@ func init() {
}
type ActiveProfileState struct {
// ID is the on-disk filename stem of the active profile. The JSON tag stays
// as "name" for backwards compatibility with active state files written
// before the ID-based config files. Legacy values were profile names, which
// were also the legacy filename stems, so they still resolve to the correct
// file on disk.
ID ID `json:"name"`
Name string `json:"name"`
Username string `json:"username"`
}
func (a *ActiveProfileState) FilePath() (string, error) {
if a.ID == "" {
return "", fmt.Errorf("active profile ID is empty")
if a.Name == "" {
return "", fmt.Errorf("active profile name is empty")
}
if a.ID == defaultProfileName {
if a.Name == defaultProfileName {
return DefaultConfigPath, nil
}
if !IsValidProfileFilenameStem(a.ID) {
return "", fmt.Errorf("invalid profile ID: %q", a.ID)
}
configDir, err := getConfigDirForUser(a.Username)
if err != nil {
return "", fmt.Errorf("failed to get config directory for user %s: %w", a.Username, err)
}
return filepath.Join(configDir, a.ID.String()+".json"), nil
return filepath.Join(configDir, a.Name+".json"), nil
}
type ServiceManager struct {
@@ -219,7 +178,7 @@ func (s *ServiceManager) GetActiveProfileState() (*ActiveProfileState, error) {
return nil, fmt.Errorf("failed to set active profile to default: %w", err)
}
return &ActiveProfileState{
ID: defaultProfileName,
Name: "default",
Username: "",
}, nil
} else {
@@ -227,12 +186,12 @@ func (s *ServiceManager) GetActiveProfileState() (*ActiveProfileState, error) {
}
}
if activeProfile.ID == "" {
if activeProfile.Name == "" {
if err := s.SetActiveProfileStateToDefault(); err != nil {
return nil, fmt.Errorf("failed to set active profile to default: %w", err)
}
return &ActiveProfileState{
ID: defaultProfileName,
Name: "default",
Username: "",
}, nil
}
@@ -257,29 +216,25 @@ func (s *ServiceManager) setDefaultActiveState() error {
}
func (s *ServiceManager) SetActiveProfileState(a *ActiveProfileState) error {
if a == nil || a.ID == "" {
if a == nil || a.Name == "" {
return errors.New("invalid active profile state")
}
if a.ID != defaultProfileName && a.Username == "" {
return fmt.Errorf("username must be set for non-default profiles, got: %s", a.ID)
}
if a.ID != defaultProfileName && !IsValidProfileFilenameStem(a.ID) {
return fmt.Errorf("invalid profile ID: %q", a.ID)
if a.Name != defaultProfileName && a.Username == "" {
return fmt.Errorf("username must be set for non-default profiles, got: %s", a.Name)
}
if err := util.WriteJsonWithRestrictedPermission(context.Background(), ActiveProfileStatePath, a); err != nil {
return fmt.Errorf("failed to write active profile state: %w", err)
}
log.Infof("active profile set to %s for %s", a.ID, a.Username)
log.Infof("active profile set to %s for %s", a.Name, a.Username)
return nil
}
func (s *ServiceManager) SetActiveProfileStateToDefault() error {
return s.SetActiveProfileState(&ActiveProfileState{
ID: defaultProfileName,
Name: "default",
Username: "",
})
}
@@ -288,117 +243,57 @@ func (s *ServiceManager) DefaultProfilePath() string {
return DefaultConfigPath
}
// AddProfile creates a new profile with a generated ID. The user-supplied
// displayName is stored inside the JSON's name field, the on-disk filename
// uses the generated ID.
//
// The returned Profile carries the freshly-generated ID so callers can
// show it to the user (and so the gRPC AddProfileResponse can include
// it).
func (s *ServiceManager) AddProfile(displayName, username string) (*Profile, error) {
func (s *ServiceManager) AddProfile(profileName, username string) error {
configDir, err := s.getConfigDir(username)
if err != nil {
return nil, fmt.Errorf("failed to get config directory: %w", err)
return fmt.Errorf("failed to get config directory: %w", err)
}
displayName, err = sanitizeDisplayName(displayName)
profileName = sanitizeProfileName(profileName)
if profileName == defaultProfileName {
return fmt.Errorf("cannot create profile with reserved name: %s", defaultProfileName)
}
profPath := filepath.Join(configDir, profileName+".json")
profileExists, err := fileExists(profPath)
if err != nil {
return nil, fmt.Errorf("invalid profile name: %w", err)
return fmt.Errorf("failed to check if profile exists: %w", err)
}
if profileExists {
return ErrProfileAlreadyExists
}
id, err := generateProfileID()
if err != nil {
return nil, fmt.Errorf("generate profile id: %w", err)
}
profPath := filepath.Join(configDir, id.String()+".json")
cfg, err := createNewConfig(ConfigInput{ConfigPath: profPath})
if err != nil {
return nil, fmt.Errorf("failed to create new config: %w", err)
}
cfg.Name = displayName
if err := util.WriteJson(context.Background(), profPath, cfg); err != nil {
return nil, fmt.Errorf("failed to write profile config: %w", err)
return fmt.Errorf("failed to create new config: %w", err)
}
return &Profile{
ID: id,
Name: displayName,
Path: profPath,
}, nil
}
func (s *ServiceManager) RenameProfile(id ID, username string, newName string) error {
displayName, err := sanitizeDisplayName(newName)
err = util.WriteJson(context.Background(), profPath, cfg)
if err != nil {
return fmt.Errorf("invalid profile name: %w", err)
return fmt.Errorf("failed to write profile config: %w", err)
}
if !IsValidProfileFilenameStem(id) {
return fmt.Errorf("invalid profile ID: %q", id)
}
profiles, err := s.loadAllProfiles(username)
if err != nil {
return fmt.Errorf("load profiles: %w", err)
}
var target *Profile
for i := range profiles {
if profiles[i].ID == id {
target = &profiles[i]
break
}
}
if target == nil {
return ErrProfileNotFound
}
data, err := os.ReadFile(target.Path)
if err != nil {
return err
}
var cfg Config
if err := json.Unmarshal(data, &cfg); err != nil {
return err
}
cfg.Name = displayName
if err := util.WriteJson(context.Background(), target.Path, cfg); err != nil {
return fmt.Errorf("failed to write profile name: %w", err)
}
return nil
}
// RemoveProfile deletes the profile identified by id. Callers must have
// already resolved any user-supplied handle to a concrete ID via
// ResolveProfile.
func (s *ServiceManager) RemoveProfile(id ID, username string) error {
if id == defaultProfileName {
defaultName := readProfileName(DefaultConfigPath)
if defaultName == "" {
defaultName = defaultProfileName
}
return fmt.Errorf("cannot remove default profile with name: %s", defaultName)
}
if !IsValidProfileFilenameStem(id) {
return fmt.Errorf("invalid profile ID: %q", id)
}
profiles, err := s.loadAllProfiles(username)
func (s *ServiceManager) RemoveProfile(profileName, username string) error {
configDir, err := s.getConfigDir(username)
if err != nil {
return fmt.Errorf("load profiles: %w", err)
return fmt.Errorf("failed to get config directory: %w", err)
}
var target *Profile
for i := range profiles {
if profiles[i].ID == id {
target = &profiles[i]
break
}
profileName = sanitizeProfileName(profileName)
if profileName == defaultProfileName {
return fmt.Errorf("cannot remove profile with reserved name: %s", defaultProfileName)
}
if target == nil {
profPath := filepath.Join(configDir, profileName+".json")
profileExists, err := fileExists(profPath)
if err != nil {
return fmt.Errorf("failed to check if profile exists: %w", err)
}
if !profileExists {
return ErrProfileNotFound
}
@@ -406,26 +301,57 @@ func (s *ServiceManager) RemoveProfile(id ID, username string) error {
if err != nil && !errors.Is(err, ErrNoActiveProfile) {
return fmt.Errorf("failed to get active profile: %w", err)
}
if activeProf != nil && activeProf.ID == id {
return fmt.Errorf("cannot remove active profile: %s", id)
if activeProf != nil && activeProf.Name == profileName {
return fmt.Errorf("cannot remove active profile: %s", profileName)
}
if err := util.RemoveJson(target.Path); err != nil {
err = util.RemoveJson(profPath)
if err != nil {
return fmt.Errorf("failed to remove profile config: %w", err)
}
stateFile := filepath.Join(filepath.Dir(target.Path), id.String()+".state.json")
if err := os.Remove(stateFile); err != nil && !os.IsNotExist(err) {
log.Warnf("failed to remove profile state file %s: %v", stateFile, err)
}
return nil
}
// ListProfiles returns every profile for the given user, including the
// default profile, with IsActive flags set.
func (s *ServiceManager) ListProfiles(username string) ([]Profile, error) {
return s.loadAllProfiles(username)
configDir, err := s.getConfigDir(username)
if err != nil {
return nil, fmt.Errorf("failed to get config directory: %w", err)
}
files, err := util.ListFiles(configDir, "*.json")
if err != nil {
return nil, fmt.Errorf("failed to list profile files: %w", err)
}
var filtered []string
for _, file := range files {
if strings.HasSuffix(file, "state.json") {
continue // skip state files
}
filtered = append(filtered, file)
}
sort.Strings(filtered)
var activeProfName string
activeProf, err := s.GetActiveProfileState()
if err == nil {
activeProfName = activeProf.Name
}
var profiles []Profile
// add default profile always
profiles = append(profiles, Profile{Name: defaultProfileName, IsActive: activeProfName == "" || activeProfName == defaultProfileName})
for _, file := range filtered {
profileName := strings.TrimSuffix(filepath.Base(file), ".json")
var isActive bool
if activeProfName != "" && activeProfName == profileName {
isActive = true
}
profiles = append(profiles, Profile{Name: profileName, IsActive: isActive})
}
return profiles, nil
}
// GetStatePath returns the path to the state file based on the operating system
@@ -443,12 +369,7 @@ func (s *ServiceManager) GetStatePath() string {
return defaultStatePath
}
if activeProf.ID == defaultProfileName {
return defaultStatePath
}
if !IsValidProfileFilenameStem(activeProf.ID) {
log.Warnf("invalid active profile ID %q, using default state path", activeProf.ID)
if activeProf.Name == defaultProfileName {
return defaultStatePath
}
@@ -458,7 +379,7 @@ func (s *ServiceManager) GetStatePath() string {
return defaultStatePath
}
return filepath.Join(configDir, activeProf.ID.String()+".state.json")
return filepath.Join(configDir, activeProf.Name+".state.json")
}
// getConfigDir returns the profiles directory, using profilesDir if set, otherwise getConfigDirForUser
@@ -469,169 +390,3 @@ func (s *ServiceManager) getConfigDir(username string) (string, error) {
return getConfigDirForUser(username)
}
// loadAllProfiles returns every profile visible to the daemon for the
// given user, including the default profile. The returned slice is sorted
// by ID for a stable display order.
//
// Each Profile is fully populated: ID is the filename stem, Name comes
// from the JSON's "name" field (falling back to the filename stem when absent)
// and Path is built from a basename read off disk.
func (s *ServiceManager) loadAllProfiles(username string) ([]Profile, error) {
activeID, activeIsDefault := s.activeProfileID()
defaultName := readProfileName(DefaultConfigPath)
if defaultName == "" {
defaultName = defaultProfileName
}
profiles := []Profile{{
ID: defaultProfileName,
Name: defaultName,
Path: DefaultConfigPath,
IsActive: activeIsDefault,
}}
configDir, err := s.getConfigDir(username)
if err != nil {
return nil, fmt.Errorf("get config directory: %w", err)
}
entries, err := os.ReadDir(configDir)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return profiles, nil
}
return nil, fmt.Errorf("read profile directory: %w", err)
}
var fileProfiles []Profile
for _, entry := range entries {
if entry.IsDir() {
continue
}
base := entry.Name()
if !strings.HasSuffix(base, ".json") {
continue
}
if strings.HasSuffix(base, ".state.json") {
continue
}
stem := ID(strings.TrimSuffix(base, ".json"))
if stem == defaultProfileName {
// default lives at the top-level config dir, not under /<user>
continue
}
if !IsValidProfileFilenameStem(ID(stem)) {
continue
}
path := filepath.Join(configDir, base)
name := readProfileName(path)
if name == "" {
name = stem.String()
}
fileProfiles = append(fileProfiles, Profile{
ID: stem,
Name: name,
Path: path,
IsActive: stem == ID(activeID),
})
}
sort.Slice(fileProfiles, func(i, j int) bool {
if fileProfiles[i].Name != fileProfiles[j].Name {
return fileProfiles[i].Name < fileProfiles[j].Name
}
// Sort tie-break on ID so duplicate names always render in the same order.
return fileProfiles[i].ID < fileProfiles[j].ID
})
profiles = append(profiles, fileProfiles...)
return profiles, nil
}
// readProfileName parses just the "name" field from the profile Json.
func readProfileName(path string) string {
data, err := os.ReadFile(path)
if err != nil {
return ""
}
var meta profileMeta
if err := json.Unmarshal(data, &meta); err != nil {
return ""
}
return meta.Name
}
// activeProfileID returns the currently-active profile's ID. The second
// return value is true when the active profile is the default one.
func (s *ServiceManager) activeProfileID() (ID, bool) {
state, err := s.GetActiveProfileState()
if err != nil || state == nil {
return defaultProfileName, true
}
if state.ID == "" || state.ID == defaultProfileName {
return defaultProfileName, true
}
return state.ID, false
}
// ResolveProfile turns a user-supplied handle into a Profile. Resolution
// precedence is: exact ID match, then unique exact name, then unique ID
// prefix. Ambiguous matches return *ErrAmbiguousHandle so callers can
// surface the candidates.
func (s *ServiceManager) ResolveProfile(handle, username string) (*Profile, error) {
if handle == "" {
return nil, fmt.Errorf("profile handle is empty")
}
profiles, err := s.loadAllProfiles(username)
if err != nil {
return nil, err
}
for i := range profiles {
if profiles[i].ID == ID(handle) {
return &profiles[i], nil
}
}
var nameMatches []Profile
for i := range profiles {
if profiles[i].Name == handle {
nameMatches = append(nameMatches, profiles[i])
}
}
if len(nameMatches) == 1 {
return &nameMatches[0], nil
}
if len(nameMatches) > 1 {
return nil, &ErrAmbiguousHandle{
Handle: handle,
Candidates: nameMatches,
Kind: AmbiguityKindName,
}
}
// ID prefix match. Skip the default profile so `select d` does not
// accidentally pick it via prefix.
var prefixMatches []Profile
for i := range profiles {
if profiles[i].ID == defaultProfileName {
continue
}
if strings.HasPrefix(profiles[i].ID.String(), handle) {
prefixMatches = append(prefixMatches, profiles[i])
}
}
if len(prefixMatches) == 1 {
return &prefixMatches[0], nil
}
if len(prefixMatches) > 1 {
return nil, &ErrAmbiguousHandle{
Handle: handle,
Candidates: prefixMatches,
Kind: AmbiguityKindIDPrefix,
}
}
return nil, ErrProfileNotFound
}

View File

@@ -1,230 +0,0 @@
package profilemanager
import (
"context"
"errors"
"os"
"os/user"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/netbirdio/netbird/util"
)
// withTestSM wires up patched globals + a clean config dir and returns a
// fully initialized ServiceManager plus the username we are scoped to.
func withTestSM(t *testing.T, fn func(sm *ServiceManager, username string)) {
t.Helper()
withTempConfigDir(t, func(configDir string) {
withPatchedGlobals(t, configDir, func() {
u, err := user.Current()
require.NoError(t, err)
sm := &ServiceManager{}
require.NoError(t, sm.CreateDefaultProfile())
fn(sm, u.Username)
})
})
}
func TestServiceProfile_ExactID(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
created, err := sm.AddProfile("work", username)
require.NoError(t, err)
got, err := sm.ResolveProfile(created.ID.String(), username)
require.NoError(t, err)
assert.Equal(t, created.ID, got.ID)
assert.Equal(t, "work", got.Name)
})
}
func TestServiceProfile_IDPrefix(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
created, err := sm.AddProfile("work", username)
require.NoError(t, err)
prefix := created.ID[:4]
got, err := sm.ResolveProfile(prefix.String(), username)
require.NoError(t, err)
assert.Equal(t, created.ID, got.ID)
})
}
func TestServiceProfile_AmbiguousPrefix(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
// Plant two profiles whose IDs share a known prefix by writing
// the files directly, since generated IDs are random.
configDir, err := sm.getConfigDir(username)
require.NoError(t, err)
for _, id := range []string{"abcd1111aaaa", "abcd2222bbbb"} {
path := filepath.Join(configDir, id+".json")
require.NoError(t, util.WriteJson(context.Background(), path, &Config{Name: id}))
}
_, err = sm.ResolveProfile("abcd", username)
var amb *ErrAmbiguousHandle
require.ErrorAs(t, err, &amb)
assert.Equal(t, AmbiguityKindIDPrefix, amb.Kind)
assert.Len(t, amb.Candidates, 2)
})
}
func TestServiceProfile_ExactNameUnique(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
_, err := sm.AddProfile("work", username)
require.NoError(t, err)
got, err := sm.ResolveProfile("work", username)
require.NoError(t, err)
assert.Equal(t, "work", got.Name)
})
}
func TestServiceProfile_AmbiguousName(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
_, err := sm.AddProfile("work", username)
require.NoError(t, err)
_, err = sm.AddProfile("work", username)
require.NoError(t, err)
_, err = sm.ResolveProfile("work", username)
var amb *ErrAmbiguousHandle
require.ErrorAs(t, err, &amb)
assert.Equal(t, AmbiguityKindName, amb.Kind)
assert.Len(t, amb.Candidates, 2)
})
}
func TestServiceProfile_NotFound(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
_, err := sm.ResolveProfile("nope", username)
assert.ErrorIs(t, err, ErrProfileNotFound)
})
}
func TestServiceProfile_DefaultByExactID(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
got, err := sm.ResolveProfile(defaultProfileName, username)
require.NoError(t, err)
assert.Equal(t, defaultProfileName, got.ID.String())
})
}
func TestServiceProfile_LegacyFilenameCoexists(t *testing.T) {
// Legacy profiles stored as <name>.json with no "name" JSON field
// should still be discoverable by name and removable by name.
withTestSM(t, func(sm *ServiceManager, username string) {
configDir, err := sm.getConfigDir(username)
require.NoError(t, err)
path := filepath.Join(configDir, "legacy.json")
require.NoError(t, util.WriteJson(context.Background(), path, &Config{}))
got, err := sm.ResolveProfile("legacy", username)
require.NoError(t, err)
assert.Equal(t, "legacy", got.ID.String())
// Name falls back to the filename stem when JSON omits it.
assert.Equal(t, "legacy", got.Name)
})
}
func TestAddProfile_AllowsDuplicateWithFlag(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
first, err := sm.AddProfile("work", username)
require.NoError(t, err)
second, err := sm.AddProfile("work", username)
require.NoError(t, err)
assert.NotEqual(t, first.ID, second.ID)
assert.Equal(t, "work", second.Name)
})
}
func TestAddProfile_RejectsInvalidNames(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
cases := []string{
"", // empty
"\x00\x01", // only control chars (becomes empty)
strings.Repeat("a", maxProfileNameLen+1), // too long
}
for _, name := range cases {
_, err := sm.AddProfile(name, username)
assert.Error(t, err, "expected error for %q", name)
}
})
}
func TestRemoveProfile_RejectsInvalidID(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
err := sm.RemoveProfile("../escape", username)
assert.Error(t, err)
})
}
func TestSanitizeDisplayName(t *testing.T) {
cases := []struct {
in string
want string
wantErr bool
}{
{"work", "work", false},
{"My Work Account", "My Work Account", false},
{"emoji 🚀 ok", "emoji 🚀 ok", false},
{"漢字テスト", "漢字テスト", false},
{"with\x00null", "withnull", false},
{"\x01\x02\x03", "", true},
{"", "", true},
}
for _, tc := range cases {
got, err := sanitizeDisplayName(tc.in)
if tc.wantErr {
assert.Error(t, err, "case %q", tc.in)
continue
}
assert.NoError(t, err, "case %q", tc.in)
assert.Equal(t, tc.want, got, "case %q", tc.in)
}
}
func TestIsValidProfileFilenameStem(t *testing.T) {
cases := []struct {
in string
want bool
}{
{"default", true},
{"abc123def456", true},
{"legacy-name", true},
{"legacy_name", true},
{"", false},
{"..", false},
{"../etc", false},
{"foo/bar", false},
{`foo\bar`, false},
{"with space", false},
{"with.dot", false},
{strings.Repeat("a", maxProfileIDLen+1), false},
}
for _, tc := range cases {
got := IsValidProfileFilenameStem(ID(tc.in))
assert.Equal(t, tc.want, got, "case %q", tc.in)
}
}
func TestRemoveProfile_DeletesStateFile(t *testing.T) {
withTestSM(t, func(sm *ServiceManager, username string) {
created, err := sm.AddProfile("work", username)
require.NoError(t, err)
configDir, err := sm.getConfigDir(username)
require.NoError(t, err)
statePath := filepath.Join(configDir, created.ID.String()+".state.json")
require.NoError(t, os.WriteFile(statePath, []byte(`{"email":"a@b"}`), 0600))
require.NoError(t, sm.RemoveProfile(created.ID, username))
_, err = os.Stat(statePath)
assert.True(t, errors.Is(err, os.ErrNotExist), "state file should be removed")
})
}

View File

@@ -13,20 +13,13 @@ type ProfileState struct {
Email string `json:"email"`
}
// GetProfileState reads the per-profile state file keyed by profile ID.
// The state file lives in the user's config directory. Legacy state files
// keyed by the old profile name remain readable.
func (pm *ProfileManager) GetProfileState(id ID) (*ProfileState, error) {
func (pm *ProfileManager) GetProfileState(profileName string) (*ProfileState, error) {
configDir, err := getConfigDir()
if err != nil {
return nil, fmt.Errorf("get config directory: %w", err)
}
if id != defaultProfileName && !IsValidProfileFilenameStem(id) {
return nil, fmt.Errorf("invalid profile ID: %q", id)
}
stateFile := filepath.Join(configDir, id.String()+".state.json")
stateFile := filepath.Join(configDir, profileName+".state.json")
stateFileExists, err := fileExists(stateFile)
if err != nil {
return nil, fmt.Errorf("failed to check if profile state file exists: %w", err)
@@ -58,12 +51,7 @@ func (pm *ProfileManager) SetActiveProfileState(state *ProfileState) error {
return fmt.Errorf("get active profile: %w", err)
}
id := activeProf.ID
if id != defaultProfileName && !IsValidProfileFilenameStem(id) {
return fmt.Errorf("invalid active profile ID: %q", id)
}
stateFile := filepath.Join(configDir, id.String()+".state.json")
stateFile := filepath.Join(configDir, activeProf.Name+".state.json")
err = util.WriteJsonWithRestrictedPermission(context.Background(), stateFile, state)
if err != nil {
return fmt.Errorf("write profile state: %w", err)

View File

@@ -32,9 +32,6 @@ type ProbeResult struct {
URI string
Err error
Addr string
// Transport is the negotiated relay transport, empty
// for stun/turn probes or when not connected.
Transport string
}
type StunTurnProbe struct {

View File

@@ -22,14 +22,14 @@ type removePeerCall struct {
}
type mockServer struct {
mu sync.Mutex
addCalls []addPeerCall
removed []removePeerCall
nextID rp.PeerID
addErr error
removeErr error
closed bool
ran bool
mu sync.Mutex
addCalls []addPeerCall
removed []removePeerCall
nextID rp.PeerID
addErr error
removeErr error
closed bool
ran bool
}
func (m *mockServer) AddPeer(cfg rp.PeerConfig) (rp.PeerID, error) {
@@ -51,7 +51,7 @@ func (m *mockServer) RemovePeer(id rp.PeerID) error {
return m.removeErr
}
func (m *mockServer) Run() error { m.ran = true; return nil }
func (m *mockServer) Run() error { m.ran = true; return nil }
func (m *mockServer) Close() error { m.closed = true; return nil }
type setPSKCall struct {

View File

@@ -41,3 +41,4 @@ func TestDeterministicSeedKey_TooShortKey_ReturnsError(t *testing.T) {
_, err = DeterministicSeedKey(long, short)
require.Error(t, err)
}

View File

@@ -251,14 +251,6 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
r.MsgHdr.AuthenticatedData = true
}
// Advertise EDNS0 to the forwarder so it may return an Extended DNS Error
// describing why a lookup failed. The OPT is stripped from the reply when
// the original client did not request EDNS0.
hadEdns := r.IsEdns0() != nil
if !hadEdns {
r.SetEdns0(dns.DefaultMsgSize, false)
}
upstream := net.JoinHostPort(upstreamIP.String(), strconv.FormatUint(uint64(d.forwarderPort.Load()), 10))
ctx, cancel := context.WithTimeout(context.Background(), dnsTimeout)
defer cancel()
@@ -268,13 +260,6 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
return
}
if ede, ok := resutil.ExtractEDE(reply); ok {
resutil.SetMeta(w, "ede", fmt.Sprintf("%d %s", ede.InfoCode, ede.ExtraText))
}
if !hadEdns {
resutil.StripOPT(reply)
}
resutil.SetMeta(w, "peer", peerKey)
reply.Id = r.Id

View File

@@ -9,7 +9,6 @@ import (
"net/url"
"runtime"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
@@ -333,8 +332,6 @@ func (m *DefaultManager) Stop(stateManager *statemanager.Manager) {
}
}
m.notifier.Close()
m.mux.Lock()
defer m.mux.Unlock()
m.clientRoutes = nil
@@ -703,8 +700,6 @@ func resolveURLsToIPs(urls []string) []net.IP {
// updateRouteSelectorFromManagement updates the route selector based on the isSelected status from the management server
func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HAMap) {
m.mirrorV6ExitPairSelections(clientRoutes)
// An explicit user "deselect all" must not be overridden by management auto-apply.
// Auto-applying an exit node here would call SelectRoutes, which clears the
// deselect-all flag and re-enables every route the user turned off.
@@ -721,24 +716,6 @@ func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HA
m.logExitNodeUpdate(exitNodeInfo)
}
// mirrorV6ExitPairSelections keeps every synthesized "-v6" exit route's selection
// consistent with its v4 base. The v4/v6 exit pair is a single toggle, so the v6
// entry always follows the base: deselecting the v4 exit node also drops its ::/0
// pair, and any stale (orphaned) explicit selection on the v6 entry is reset. This
// runs before selection is read so both collectExitNodeInfo and FilterSelectedExitNodes
// see consistent state, including pairs loaded from persisted selector state.
func (m *DefaultManager) mirrorV6ExitPairSelections(clientRoutes route.HAMap) {
routesByNetID := make(map[route.NetID][]*route.Route, len(clientRoutes))
for haID, routes := range clientRoutes {
routesByNetID[haID.NetID()] = routes
}
for v6ID := range route.V6ExitMergeSet(routesByNetID) {
baseID := route.NetID(strings.TrimSuffix(string(v6ID), route.V6ExitSuffix))
m.routeSelector.SyncPairedSelection(baseID, v6ID)
}
}
type exitNodeInfo struct {
allIDs []route.NetID
selectedByManagement []route.NetID

View File

@@ -1,47 +0,0 @@
package routemanager
import (
"net/netip"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/netbirdio/netbird/client/internal/routeselector"
"github.com/netbirdio/netbird/route"
)
// TestUpdateRouteSelectorFromManagement_MirrorsV6ExitPair reproduces the bug seen
// in netbird-engine.log: persisted selector state has the v4 exit node deselected
// but its synthesized "-v6" pair explicitly selected (orphaned), so the ::/0 route
// leaked onto the tunnel. The management update must mirror the v4 deselect onto the
// v6 pair so FilterSelectedExitNodes drops it.
func TestUpdateRouteSelectorFromManagement_MirrorsV6ExitPair(t *testing.T) {
const (
v4ID = route.NetID("Exit Node (raspberrypi)")
v6ID = route.NetID("Exit Node (raspberrypi)-v6")
)
all := []route.NetID{v4ID, v6ID}
rs := routeselector.NewRouteSelector()
// Orphan the v6 selection: select the pair, then deselect only the v4 base.
require.NoError(t, rs.SelectRoutes([]route.NetID{v4ID, v6ID}, true, all))
require.NoError(t, rs.DeselectRoutes([]route.NetID{v4ID}, all))
require.True(t, rs.IsSelected(v6ID), "precondition: orphaned v6 selection survives v4 deselect")
m := &DefaultManager{routeSelector: rs}
v4Route := &route.Route{NetID: v4ID, Network: netip.MustParsePrefix("0.0.0.0/0")}
v6Route := &route.Route{NetID: v6ID, Network: netip.MustParsePrefix("::/0")}
clientRoutes := route.HAMap{
"Exit Node (raspberrypi)|0.0.0.0/0": {v4Route},
"Exit Node (raspberrypi)-v6|::/0": {v6Route},
}
m.updateRouteSelectorFromManagement(clientRoutes)
assert.False(t, rs.IsSelected(v6ID), "v6 pair must follow the v4 base deselect after the management update")
filtered := rs.FilterSelectedExitNodes(clientRoutes)
assert.Empty(t, filtered, "deselected v4 exit node must not leak its ::/0 pair onto the tunnel")
}

View File

@@ -16,7 +16,7 @@ import (
type Notifier struct {
initialRoutes []*route.Route
currentRoutes []*route.Route
fakeIPRoutes []*route.Route
fakeIPRoutes []*route.Route
listener listener.NetworkChangeListener
listenerMux sync.Mutex
@@ -119,7 +119,3 @@ func (n *Notifier) GetInitialRouteRanges() []string {
sort.Strings(initialStrings)
return initialStrings
}
func (n *Notifier) Close() {
// unused
}

View File

@@ -3,7 +3,6 @@
package notifier
import (
"container/list"
"net/netip"
"slices"
"sort"
@@ -15,26 +14,19 @@ import (
)
type Notifier struct {
mu sync.Mutex
cond *sync.Cond
currentPrefixes []string
listener listener.NetworkChangeListener
queue *list.List
closed bool
listener listener.NetworkChangeListener
listenerMux sync.Mutex
}
func NewNotifier() *Notifier {
n := &Notifier{
queue: list.New(),
}
n.cond = sync.NewCond(&n.mu)
go n.deliverLoop()
return n
return &Notifier{}
}
func (n *Notifier) SetListener(listener listener.NetworkChangeListener) {
n.mu.Lock()
defer n.mu.Unlock()
n.listenerMux.Lock()
defer n.listenerMux.Unlock()
n.listener = listener
}
@@ -51,52 +43,32 @@ func (n *Notifier) OnNewRoutes(route.HAMap) {
}
func (n *Notifier) OnNewPrefixes(prefixes []netip.Prefix) {
newNets := make([]string, 0, len(prefixes))
newNets := make([]string, 0)
for _, prefix := range prefixes {
newNets = append(newNets, prefix.String())
}
sort.Strings(newNets)
n.mu.Lock()
if slices.Equal(n.currentPrefixes, newNets) {
n.mu.Unlock()
return
}
n.currentPrefixes = newNets
routes := strings.Join(n.currentPrefixes, ",")
n.queue.PushBack(routes)
n.cond.Signal()
n.mu.Unlock()
}
func (n *Notifier) Close() {
n.mu.Lock()
n.closed = true
n.cond.Signal()
n.mu.Unlock()
n.currentPrefixes = newNets
n.notify()
}
func (n *Notifier) notify() {
n.listenerMux.Lock()
defer n.listenerMux.Unlock()
if n.listener == nil {
return
}
go func(l listener.NetworkChangeListener) {
l.OnNetworkChanged(strings.Join(n.currentPrefixes, ","))
}(n.listener)
}
func (n *Notifier) GetInitialRouteRanges() []string {
return nil
}
func (n *Notifier) deliverLoop() {
for {
n.mu.Lock()
for n.queue.Len() == 0 && !n.closed {
n.cond.Wait()
}
if n.closed && n.queue.Len() == 0 {
n.mu.Unlock()
return
}
routes := n.queue.Remove(n.queue.Front()).(string)
l := n.listener
n.mu.Unlock()
if l != nil {
l.OnNetworkChanged(routes)
}
}
}

View File

@@ -38,7 +38,3 @@ func (n *Notifier) OnNewPrefixes(prefixes []netip.Prefix) {
func (n *Notifier) GetInitialRouteRanges() []string {
return []string{}
}
func (n *Notifier) Close() {
// unused
}

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"slices"
"strings"
"sync"
"github.com/hashicorp/go-multierror"
@@ -131,33 +132,6 @@ func (rs *RouteSelector) IsSelected(routeID route.NetID) bool {
return rs.isSelectedLocked(routeID)
}
// SyncPairedSelection forces pairedID's explicit selection state to match baseID's,
// so a synthesized "-v6" exit route always follows its v4 base: selecting or
// deselecting the v4 exit node governs the ::/0 pair, and any stale (orphaned)
// explicit state on the v6 entry is reset. The v4/v6 exit pair is treated as a single
// toggle, so the v6 entry carries no independent selection of its own.
func (rs *RouteSelector) SyncPairedSelection(baseID, pairedID route.NetID) {
rs.mu.Lock()
defer rs.mu.Unlock()
if rs.deselectAll {
return
}
_, baseSelected := rs.selectedRoutes[baseID]
_, baseDeselected := rs.deselectedRoutes[baseID]
delete(rs.selectedRoutes, pairedID)
delete(rs.deselectedRoutes, pairedID)
switch {
case baseSelected:
rs.selectedRoutes[pairedID] = struct{}{}
case baseDeselected:
rs.deselectedRoutes[pairedID] = struct{}{}
}
}
// FilterSelected removes unselected routes from the provided map.
func (rs *RouteSelector) FilterSelected(routes route.HAMap) route.HAMap {
rs.mu.RLock()
@@ -177,13 +151,14 @@ func (rs *RouteSelector) FilterSelected(routes route.HAMap) route.HAMap {
}
// HasUserSelectionForRoute returns true if the user has explicitly selected or deselected this route.
// The lookup is literal; v4/v6 exit pairs are kept consistent at write time via SyncPairedSelection,
// so a synthesized "-v6" entry carries the same explicit state as its v4 base.
// Intended for exit-node code paths: a v6 exit-node pair (e.g. "MyExit-v6") with no explicit state of
// its own inherits its v4 base's state, so legacy persisted selections that predate v6 pairing
// transparently apply to the synthesized v6 entry.
func (rs *RouteSelector) HasUserSelectionForRoute(routeID route.NetID) bool {
rs.mu.RLock()
defer rs.mu.RUnlock()
return rs.hasUserSelectionForRouteLocked(routeID)
return rs.hasUserSelectionForRouteLocked(rs.effectiveNetID(routeID))
}
func (rs *RouteSelector) FilterSelectedExitNodes(routes route.HAMap) route.HAMap {
@@ -212,6 +187,83 @@ func (rs *RouteSelector) FilterSelectedExitNodes(routes route.HAMap) route.HAMap
return filtered
}
// effectiveNetID returns the v4 base for a "-v6" exit pair entry that has no explicit
// state of its own, so selections made on the v4 entry govern the v6 entry automatically.
// Only call this from exit-node-specific code paths: applying it to a non-exit "-v6" route
// would make it inherit unrelated v4 state. Must be called with rs.mu held.
func (rs *RouteSelector) effectiveNetID(id route.NetID) route.NetID {
name := string(id)
if !strings.HasSuffix(name, route.V6ExitSuffix) {
return id
}
if _, ok := rs.selectedRoutes[id]; ok {
return id
}
if _, ok := rs.deselectedRoutes[id]; ok {
return id
}
return route.NetID(strings.TrimSuffix(name, route.V6ExitSuffix))
}
func (rs *RouteSelector) isSelectedLocked(routeID route.NetID) bool {
if rs.deselectAll {
return false
}
_, deselected := rs.deselectedRoutes[routeID]
return !deselected
}
func (rs *RouteSelector) isDeselectedLocked(netID route.NetID) bool {
if rs.deselectAll {
return true
}
_, deselected := rs.deselectedRoutes[netID]
return deselected
}
func (rs *RouteSelector) hasUserSelectionForRouteLocked(routeID route.NetID) bool {
_, selected := rs.selectedRoutes[routeID]
_, deselected := rs.deselectedRoutes[routeID]
return selected || deselected
}
func isExitNode(rt []*route.Route) bool {
return len(rt) > 0 && (route.IsV4DefaultRoute(rt[0].Network) || route.IsV6DefaultRoute(rt[0].Network))
}
func (rs *RouteSelector) applyExitNodeFilter(
id route.HAUniqueID,
netID route.NetID,
rt []*route.Route,
out route.HAMap,
) {
// Exit-node path: apply the v4/v6 pair mirror so a deselect on the v4 base also
// drops the synthesized v6 entry that lacks its own explicit state.
effective := rs.effectiveNetID(netID)
if rs.hasUserSelectionForRouteLocked(effective) {
if rs.isSelectedLocked(effective) {
out[id] = rt
}
return
}
// no explicit selection for this route: defer to management's SkipAutoApply flag
sel := collectSelected(rt)
if len(sel) > 0 {
out[id] = sel
}
}
func collectSelected(rt []*route.Route) []*route.Route {
var sel []*route.Route
for _, r := range rt {
if !r.SkipAutoApply {
sel = append(sel, r)
}
}
return sel
}
// MarshalJSON implements the json.Marshaler interface
func (rs *RouteSelector) MarshalJSON() ([]byte, error) {
rs.mu.RLock()
@@ -265,59 +317,3 @@ func (rs *RouteSelector) UnmarshalJSON(data []byte) error {
return nil
}
func (rs *RouteSelector) isSelectedLocked(routeID route.NetID) bool {
if rs.deselectAll {
return false
}
_, deselected := rs.deselectedRoutes[routeID]
return !deselected
}
func (rs *RouteSelector) isDeselectedLocked(netID route.NetID) bool {
if rs.deselectAll {
return true
}
_, deselected := rs.deselectedRoutes[netID]
return deselected
}
func (rs *RouteSelector) hasUserSelectionForRouteLocked(routeID route.NetID) bool {
_, selected := rs.selectedRoutes[routeID]
_, deselected := rs.deselectedRoutes[routeID]
return selected || deselected
}
func (rs *RouteSelector) applyExitNodeFilter(
id route.HAUniqueID,
netID route.NetID,
rt []*route.Route,
out route.HAMap,
) {
if rs.hasUserSelectionForRouteLocked(netID) {
if rs.isSelectedLocked(netID) {
out[id] = rt
}
return
}
// no explicit selection for this route: defer to management's SkipAutoApply flag
sel := collectSelected(rt)
if len(sel) > 0 {
out[id] = sel
}
}
func isExitNode(rt []*route.Route) bool {
return len(rt) > 0 && (route.IsV4DefaultRoute(rt[0].Network) || route.IsV6DefaultRoute(rt[0].Network))
}
func collectSelected(rt []*route.Route) []*route.Route {
var sel []*route.Route
for _, r := range rt {
if !r.SkipAutoApply {
sel = append(sel, r)
}
}
return sel
}

View File

@@ -330,73 +330,39 @@ func TestRouteSelector_FilterSelectedExitNodes(t *testing.T) {
assert.Len(t, filtered, 0) // No routes should be selected
}
// TestRouteSelector_V6ExitPairSync covers SyncPairedSelection, which keeps a v4
// exit node and its synthesized "-v6" counterpart consistent. The selector itself
// is literal and never infers a v6 entry's state from its v4 base; callers that know
// the pairing (exit-node code paths) call SyncPairedSelection to force the v6 entry
// to follow the base, treating the pair as a single toggle.
func TestRouteSelector_V6ExitPairSync(t *testing.T) {
// TestRouteSelector_V6ExitPairInherits covers the v4/v6 exit-node pair selection
// mirror. The mirror is scoped to exit-node code paths: HasUserSelectionForRoute
// and FilterSelectedExitNodes resolve a "-v6" entry without explicit state to its
// v4 base, so legacy persisted selections that predate v6 pairing transparently
// apply to the synthesized v6 entry. General lookups (IsSelected, FilterSelected)
// stay literal so unrelated routes named "*-v6" don't inherit unrelated state.
func TestRouteSelector_V6ExitPairInherits(t *testing.T) {
all := []route.NetID{"exit1", "exit1-v6", "exit2", "exit2-v6", "corp", "corp-v6"}
t.Run("selector lookups stay literal without sync", func(t *testing.T) {
t.Run("HasUserSelectionForRoute mirrors deselected v4 base", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"exit1"}, all))
// The selector does not pair-resolve: the v6 entry is independent until synced.
assert.False(t, rs.HasUserSelectionForRoute("exit1-v6"), "v6 entry has no state of its own")
assert.True(t, rs.IsSelected("exit1-v6"), "unsynced v6 entry stays selected by default")
assert.True(t, rs.HasUserSelectionForRoute("exit1-v6"), "v6 pair sees v4 base's user selection")
// A route literally named "exit1-something" must never pair-resolve either.
assert.False(t, rs.HasUserSelectionForRoute("exit1-something"))
// unrelated v6 with no v4 base touched is unaffected
assert.False(t, rs.HasUserSelectionForRoute("exit2-v6"))
})
t.Run("sync mirrors deselected v4 base onto v6", func(t *testing.T) {
t.Run("IsSelected stays literal for non-exit lookups", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"corp"}, all))
// A non-exit route literally named "corp-v6" must not inherit "corp"'s state
// via the mirror; the mirror only applies in exit-node code paths.
assert.False(t, rs.IsSelected("corp"))
assert.True(t, rs.IsSelected("corp-v6"), "non-exit *-v6 routes must not inherit unrelated v4 state")
})
t.Run("explicit v6 state overrides v4 base in filter", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"exit1"}, all))
rs.SyncPairedSelection("exit1", "exit1-v6")
assert.False(t, rs.IsSelected("exit1"))
assert.False(t, rs.IsSelected("exit1-v6"), "v6 pair follows v4 base deselect")
assert.True(t, rs.HasUserSelectionForRoute("exit1-v6"), "v6 carries explicit deselect after sync")
})
t.Run("sync mirrors selected v4 base onto v6", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.SelectRoutes([]route.NetID{"exit1"}, false, all))
rs.SyncPairedSelection("exit1", "exit1-v6")
assert.True(t, rs.IsSelected("exit1"))
assert.True(t, rs.IsSelected("exit1-v6"), "v6 pair follows v4 base select")
})
t.Run("sync clears v6 state when base has no explicit selection", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.SelectRoutes([]route.NetID{"exit1-v6"}, true, all))
require.True(t, rs.HasUserSelectionForRoute("exit1-v6"))
rs.SyncPairedSelection("exit1", "exit1-v6")
assert.False(t, rs.HasUserSelectionForRoute("exit1-v6"),
"v6 explicit state is cleared so it follows management like its base")
})
// Regression for the observed bug (see netbird-engine.log): persisted state has
// the v4 base deselected but the v6 sibling explicitly selected (orphaned). The
// sync must reset the orphan so the ::/0 route does not leak onto the tunnel.
t.Run("sync clears orphaned explicit v6 selection on deselected base", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
// Prior state: both explicitly selected, then only the v4 base deselected,
// leaving the v6 entry as a stale explicit selection.
require.NoError(t, rs.SelectRoutes([]route.NetID{"exit1", "exit1-v6"}, true, all))
require.NoError(t, rs.DeselectRoutes([]route.NetID{"exit1"}, all))
require.True(t, rs.IsSelected("exit1-v6"), "precondition: orphaned v6 selection")
rs.SyncPairedSelection("exit1", "exit1-v6")
assert.False(t, rs.IsSelected("exit1-v6"), "orphaned v6 selection reset to follow v4 deselect")
v4Route := &route.Route{NetID: "exit1", Network: netip.MustParsePrefix("0.0.0.0/0")}
v6Route := &route.Route{NetID: "exit1-v6", Network: netip.MustParsePrefix("::/0")}
@@ -404,14 +370,23 @@ func TestRouteSelector_V6ExitPairSync(t *testing.T) {
"exit1|0.0.0.0/0": {v4Route},
"exit1-v6|::/0": {v6Route},
}
filtered := rs.FilterSelectedExitNodes(routes)
assert.Empty(t, filtered, "deselecting v4 base must drop the v6 pair even if it was explicitly selected before")
assert.NotContains(t, filtered, route.HAUniqueID("exit1|0.0.0.0/0"))
assert.Contains(t, filtered, route.HAUniqueID("exit1-v6|::/0"), "explicit v6 select wins over v4 base")
})
t.Run("filter drops synced v6 pair of deselected v4 base", func(t *testing.T) {
t.Run("non-v6-suffix routes unaffected", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"exit1"}, all))
// A route literally named "exit1-something" must not pair-resolve.
assert.False(t, rs.HasUserSelectionForRoute("exit1-something"))
})
t.Run("filter v6 paired with deselected v4 base", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"exit1"}, all))
rs.SyncPairedSelection("exit1", "exit1-v6")
v4Route := &route.Route{NetID: "exit1", Network: netip.MustParsePrefix("0.0.0.0/0")}
v6Route := &route.Route{NetID: "exit1-v6", Network: netip.MustParsePrefix("::/0")}
@@ -424,15 +399,6 @@ func TestRouteSelector_V6ExitPairSync(t *testing.T) {
assert.Empty(t, filtered, "deselecting v4 base must also drop the v6 pair")
})
t.Run("deselectAll makes sync a no-op", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
rs.DeselectAllRoutes()
rs.SyncPairedSelection("exit1", "exit1-v6")
assert.False(t, rs.HasUserSelectionForRoute("exit1-v6"), "sync must not write explicit state under deselectAll")
})
t.Run("non-exit *-v6 routes pass through FilterSelectedExitNodes", func(t *testing.T) {
rs := routeselector.NewRouteSelector()
require.NoError(t, rs.DeselectRoutes([]route.NetID{"corp"}, all))

View File

@@ -17,7 +17,6 @@ import (
"github.com/netbirdio/netbird/client/internal"
"github.com/netbirdio/netbird/client/internal/auth"
"github.com/netbirdio/netbird/client/internal/debug"
"github.com/netbirdio/netbird/client/internal/dns"
"github.com/netbirdio/netbird/client/internal/listener"
"github.com/netbirdio/netbird/client/internal/peer"
@@ -26,7 +25,6 @@ import (
"github.com/netbirdio/netbird/formatter"
"github.com/netbirdio/netbird/route"
"github.com/netbirdio/netbird/shared/management/domain"
types "github.com/netbirdio/netbird/upload-server/types"
)
// ConnectionListener export internal Listener for mobile
@@ -56,7 +54,6 @@ type selectRoute struct {
Network netip.Prefix
Domains domain.List
Selected bool
Status string
extraNetworks []netip.Prefix
}
@@ -68,8 +65,6 @@ func init() {
type Client struct {
cfgFile string
stateFile string
cacheDir string
logFilePath string
recorder *peer.Status
ctxCancel context.CancelFunc
ctxCancelLock *sync.Mutex
@@ -80,21 +75,16 @@ type Client struct {
onHostDnsFn func([]string)
dnsManager dns.IosDnsManager
loginComplete bool
connectClient *internal.ConnectClient
// preloadedConfig holds config loaded from JSON (used on tvOS where file writes are blocked)
preloadedConfig *profilemanager.Config
stateMu sync.RWMutex
connectClient *internal.ConnectClient
config *profilemanager.Config
}
// NewClient instantiate a new Client
func NewClient(cfgFile, stateFile, cacheDir, logFilePath, deviceName string, osVersion string, osName string, networkChangeListener NetworkChangeListener, dnsManager DnsManager) *Client {
func NewClient(cfgFile, stateFile, deviceName string, osVersion string, osName string, networkChangeListener NetworkChangeListener, dnsManager DnsManager) *Client {
return &Client{
cfgFile: cfgFile,
stateFile: stateFile,
cacheDir: cacheDir,
logFilePath: logFilePath,
deviceName: deviceName,
osName: osName,
osVersion: osVersion,
@@ -171,13 +161,8 @@ func (c *Client) Run(fd int32, interfaceName string, envList *EnvList) error {
c.onHostDnsFn = func([]string) {}
cfg.WgIface = interfaceName
connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
c.setState(cfg, connectClient)
// Persist the latest sync response so DebugBundle can include the network
// map. On iOS this is backed by disk to keep it out of the constrained
// process memory (see the syncstore package).
connectClient.SetSyncResponsePersistence(true)
return connectClient.RunOniOS(fd, c.networkChangeListener, c.dnsManager, c.stateFile, c.cacheDir, c.logFilePath)
c.connectClient = internal.NewConnectClient(ctx, cfg, c.recorder)
return c.connectClient.RunOniOS(fd, c.networkChangeListener, c.dnsManager, c.stateFile)
}
// Stop the internal client and free the resources
@@ -189,84 +174,6 @@ func (c *Client) Stop() {
}
c.ctxCancel()
c.setState(nil, nil)
}
// DebugBundle generates a debug bundle, uploads it and returns the upload key.
// It works with or without a running engine: when the engine is up it reuses
// the live config, sync response and client metrics; otherwise it loads the
// config from disk (or the preloaded tvOS config).
func (c *Client) DebugBundle(anonymize bool) (string, error) {
cfg, cc := c.stateSnapshot()
// If the engine hasn't been started, load config so we can reach management.
if cfg == nil {
if c.preloadedConfig != nil {
cfg = c.preloadedConfig
} else {
var err error
// Use DirectUpdateOrCreateConfig to avoid atomic file operations
// (temp file + rename) blocked by the tvOS sandbox.
cfg, err = profilemanager.DirectUpdateOrCreateConfig(profilemanager.ConfigInput{
ConfigPath: c.cfgFile,
StateFilePath: c.stateFile,
})
if err != nil {
return "", fmt.Errorf("load config: %w", err)
}
}
}
deps := debug.GeneratorDependencies{
InternalConfig: cfg,
StatusRecorder: c.recorder,
TempDir: c.cacheDir,
StatePath: c.stateFile,
LogPath: c.logFilePath,
}
if cc != nil {
resp, err := cc.GetLatestSyncResponse()
if err != nil {
log.Warnf("get latest sync response: %v", err)
}
deps.SyncResponse = resp
if e := cc.Engine(); e != nil {
if cm := e.GetClientMetrics(); cm != nil {
deps.ClientMetrics = cm
}
}
}
bundleGenerator := debug.NewBundleGenerator(
deps,
debug.BundleConfig{
Anonymize: anonymize,
IncludeSystemInfo: true,
},
)
path, err := bundleGenerator.Generate()
if err != nil {
return "", fmt.Errorf("generate debug bundle: %w", err)
}
defer func() {
if err := os.Remove(path); err != nil {
log.Errorf("failed to remove debug bundle file: %v", err)
}
}()
uploadCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
key, err := debug.UploadDebugBundle(uploadCtx, types.DefaultBundleURL, cfg.ManagementURL.String(), path)
if err != nil {
return "", fmt.Errorf("upload debug bundle: %w", err)
}
log.Infof("debug bundle uploaded with key %s", key)
return key, nil
}
// SetTraceLogLevel configure the logger to trace level
@@ -320,16 +227,6 @@ func (c *Client) RemoveConnectionListener() {
c.recorder.RemoveConnectionListener()
}
// IsLoginRequiredCached reports whether the LAST observed management error was an
// auth failure (PermissionDenied/InvalidArgument), using the in-memory status
// recorder. Unlike IsLoginRequired() it performs NO network call, so it is safe to
// call from the connection listener during teardown (e.g. onDisconnected) without
// blocking on a slow or unavailable network. Returns false while connected to
// management or when the last error was not auth-related.
func (c *Client) IsLoginRequiredCached() bool {
return c.recorder.IsLoginRequired()
}
func (c *Client) IsLoginRequired() bool {
var ctx context.Context
//nolint
@@ -457,12 +354,11 @@ func (c *Client) ClearLoginComplete() {
}
func (c *Client) GetRoutesSelectionDetails() (*RoutesSelectionDetails, error) {
_, connectClient := c.stateSnapshot()
if connectClient == nil {
if c.connectClient == nil {
return nil, fmt.Errorf("not connected")
}
engine := connectClient.Engine()
engine := c.connectClient.Engine()
if engine == nil {
return nil, fmt.Errorf("not connected")
}
@@ -481,57 +377,9 @@ func (c *Client) GetRoutesSelectionDetails() (*RoutesSelectionDetails, error) {
routes := buildSelectRoutes(routesMap, routeSelector.IsSelected, v6ExitMerged)
resolvedDomains := c.recorder.GetResolvedDomainsStates()
// Compute each route's connection status in the core (mirroring the Android
// bridge), so the UI doesn't have to infer it by string-matching the joined
// Network value against peer routes. For a merged exit node the status reflects
// whichever of the v4/v6 prefixes is served by a connected peer; for dynamic
// (DNS) routes the peer route key is the domain pattern (see dynamic.Route.String).
connectedRoutes := c.connectedRouteSet()
for _, r := range routes {
r.Status = routeStatus(r, connectedRoutes)
}
return prepareRouteSelectionDetails(routes, resolvedDomains), nil
}
// connectedRouteSet returns the set of route keys (as strings) currently served by a
// connected peer, gathered across all connected peers' route tables. The keys match
// what the route manager records: a prefix string for static routes (e.g. "0.0.0.0/0")
// and the domain pattern for dynamic routes (e.g. "*.example.com").
func (c *Client) connectedRouteSet() map[string]struct{} {
connected := map[string]struct{}{}
for _, p := range c.recorder.GetFullStatus().Peers {
if p.ConnStatus != peer.StatusConnected {
continue
}
for r := range p.GetRoutes() {
connected[r] = struct{}{}
}
}
return connected
}
// routeStatus reports "Connected" if any of the route's keys is served by a connected
// peer: the primary Network prefix, an extra v6 network of a merged exit node, or the
// domain pattern for a dynamic DNS route. Otherwise "Idle".
func routeStatus(r *selectRoute, connectedRoutes map[string]struct{}) string {
keys := make([]string, 0, 1+len(r.extraNetworks))
if len(r.Domains) > 0 {
keys = append(keys, r.Domains.SafeString())
} else {
keys = append(keys, r.Network.String())
}
for _, extra := range r.extraNetworks {
keys = append(keys, extra.String())
}
for _, k := range keys {
if _, ok := connectedRoutes[k]; ok {
return peer.StatusConnected.String()
}
}
return peer.StatusIdle.String()
}
func buildSelectRoutes(routesMap map[route.NetID][]*route.Route, isSelected func(route.NetID) bool, v6Merged map[route.NetID]struct{}) []*selectRoute {
var routes []*selectRoute
for id, rt := range routesMap {
@@ -614,7 +462,6 @@ func prepareRouteSelectionDetails(routes []*selectRoute, resolvedDomains map[dom
Network: netStr,
Domains: &domainDetails,
Selected: r.Selected,
Status: r.Status,
})
}
@@ -623,12 +470,11 @@ func prepareRouteSelectionDetails(routes []*selectRoute, resolvedDomains map[dom
}
func (c *Client) SelectRoute(id string) error {
_, connectClient := c.stateSnapshot()
if connectClient == nil {
if c.connectClient == nil {
return fmt.Errorf("not connected")
}
engine := connectClient.Engine()
engine := c.connectClient.Engine()
if engine == nil {
return fmt.Errorf("not connected")
}
@@ -654,11 +500,10 @@ func (c *Client) SelectRoute(id string) error {
}
func (c *Client) DeselectRoute(id string) error {
_, connectClient := c.stateSnapshot()
if connectClient == nil {
if c.connectClient == nil {
return fmt.Errorf("not connected")
}
engine := connectClient.Engine()
engine := c.connectClient.Engine()
if engine == nil {
return fmt.Errorf("not connected")
}
@@ -682,22 +527,6 @@ func (c *Client) DeselectRoute(id string) error {
return nil
}
// setState stores the running engine state so DebugBundle can reuse the live
// config and ConnectClient. It is cleared on Stop.
func (c *Client) setState(cfg *profilemanager.Config, cc *internal.ConnectClient) {
c.stateMu.Lock()
defer c.stateMu.Unlock()
c.config = cfg
c.connectClient = cc
}
// stateSnapshot returns the current config and ConnectClient under the lock.
func (c *Client) stateSnapshot() (*profilemanager.Config, *internal.ConnectClient) {
c.stateMu.RLock()
defer c.stateMu.RUnlock()
return c.config, c.connectClient
}
func formatDuration(d time.Duration) string {
ds := d.String()
dotIndex := strings.Index(ds, ".")

View File

@@ -36,7 +36,6 @@ type URLOpener interface {
// Auth can register or login new client
type Auth struct {
ctx context.Context
cancel context.CancelFunc
config *profilemanager.Config
cfgPath string
}
@@ -52,19 +51,8 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {
return nil, err
}
// Use a cancellable context so Stop() can abort an in-progress interactive
// login. The PKCE flow's WaitToken blocks (and keeps its loopback HTTP server
// bound to a port) until the OAuth callback arrives or the flow expires;
// cancelling the context unblocks WaitToken, which then shuts that server down
// and frees the port for the next login attempt. iOS runs login in the main-app
// process (decoupled from the network extension), so without this the server
// lingers after the user dismisses the browser and the next connect stalls
// trying to bind the same port.
ctx, cancel := context.WithCancel(context.Background())
return &Auth{
ctx: ctx,
cancel: cancel,
ctx: context.Background(),
config: cfg,
cfgPath: cfgPath,
}, nil
@@ -72,24 +60,12 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {
// NewAuthWithConfig instantiate Auth based on existing config
func NewAuthWithConfig(ctx context.Context, config *profilemanager.Config) *Auth {
ctx, cancel := context.WithCancel(ctx)
return &Auth{
ctx: ctx,
cancel: cancel,
config: config,
}
}
// Stop aborts an in-progress interactive login started via Login/LoginWithDeviceName.
// It cancels the auth context, which unblocks the PKCE WaitToken and shuts down its
// loopback HTTP server, freeing the redirect port. Safe to call multiple times and
// safe to call when no login is running.
func (a *Auth) Stop() {
if a.cancel != nil {
a.cancel()
}
}
// SaveConfigIfSSOSupported test the connectivity with the management server by retrieving the server device flow info.
// If it returns a flow info than save the configuration and return true. If it gets a codes.NotFound, it means that SSO
// is not supported and returns false without saving the configuration. For other errors return false.

View File

@@ -20,7 +20,6 @@ type RoutesSelectionInfo struct {
Network string
Domains *DomainDetails
Selected bool
Status string
}
type DomainCollection interface {

File diff suppressed because it is too large Load Diff

View File

@@ -85,8 +85,6 @@ service DaemonService {
rpc AddProfile(AddProfileRequest) returns (AddProfileResponse) {}
rpc RenameProfile(RenameProfileRequest) returns (RenameProfileResponse) {}
rpc RemoveProfile(RemoveProfileRequest) returns (RemoveProfileResponse) {}
rpc ListProfiles(ListProfilesRequest) returns (ListProfilesResponse) {}
@@ -380,9 +378,6 @@ message RelayState {
string URI = 1;
bool available = 2;
string error = 3;
// transport is the negotiated relay transport (e.g. "ws", "quic"),
// empty for stun/turn probes or when not connected.
string transport = 4;
}
message NSGroupState {
@@ -627,18 +622,11 @@ message GetEventsResponse {
}
message SwitchProfileRequest {
// profileName is treated as a handle: exact ID, unique ID prefix, or
// unique display name. The daemon resolves it server-side.
optional string profileName = 1;
optional string username = 2;
}
message SwitchProfileResponse {
// id is the resolved on-disk ID of the profile that became active.
// Lets CLI clients update their local active-profile state without
// duplicating the resolution logic.
string id = 1;
}
message SwitchProfileResponse {}
message SetConfigRequest {
string username = 1;
@@ -705,42 +693,17 @@ message SetConfigResponse{}
message AddProfileRequest {
string username = 1;
// profileName carries the human-readable display name for the new
// profile. The on-disk filename is a separately-generated ID.
string profileName = 2;
}
message AddProfileResponse {
// id is the generated on-disk ID of the new profile. CLI clients
// display a truncated form, UI clients can ignore it.
string id = 1;
}
message RenameProfileRequest {
string username = 1;
// handle: an exact ID, a unique ID prefix, or a unique display name.
string handle = 2;
// newProfileName is the new human-readable display name for the profile.
string newProfileName = 3;
}
message RenameProfileResponse {
// confirm the old profile name after resolving handle.
string oldProfileName = 1;
}
message AddProfileResponse {}
message RemoveProfileRequest {
string username = 1;
// profileName is treated as a handle: an exact ID, a unique ID
// prefix, or a unique display name. Resolution happens server-side.
string profileName = 2;
}
message RemoveProfileResponse {
// id is the full resolved ID of the removed profile, so callers can
// confirm exactly which profile a name/prefix handle resolved to.
string id = 1;
}
message RemoveProfileResponse {}
message ListProfilesRequest {
string username = 1;
@@ -753,7 +716,6 @@ message ListProfilesResponse {
message Profile {
string name = 1;
bool is_active = 2;
string id = 3;
}
message GetActiveProfileRequest {}
@@ -761,7 +723,6 @@ message GetActiveProfileRequest {}
message GetActiveProfileResponse {
string profileName = 1;
string username = 2;
string id = 3;
}
message LogoutRequest {

View File

@@ -45,7 +45,6 @@ const (
DaemonService_SwitchProfile_FullMethodName = "/daemon.DaemonService/SwitchProfile"
DaemonService_SetConfig_FullMethodName = "/daemon.DaemonService/SetConfig"
DaemonService_AddProfile_FullMethodName = "/daemon.DaemonService/AddProfile"
DaemonService_RenameProfile_FullMethodName = "/daemon.DaemonService/RenameProfile"
DaemonService_RemoveProfile_FullMethodName = "/daemon.DaemonService/RemoveProfile"
DaemonService_ListProfiles_FullMethodName = "/daemon.DaemonService/ListProfiles"
DaemonService_GetActiveProfile_FullMethodName = "/daemon.DaemonService/GetActiveProfile"
@@ -113,7 +112,6 @@ type DaemonServiceClient interface {
SwitchProfile(ctx context.Context, in *SwitchProfileRequest, opts ...grpc.CallOption) (*SwitchProfileResponse, error)
SetConfig(ctx context.Context, in *SetConfigRequest, opts ...grpc.CallOption) (*SetConfigResponse, error)
AddProfile(ctx context.Context, in *AddProfileRequest, opts ...grpc.CallOption) (*AddProfileResponse, error)
RenameProfile(ctx context.Context, in *RenameProfileRequest, opts ...grpc.CallOption) (*RenameProfileResponse, error)
RemoveProfile(ctx context.Context, in *RemoveProfileRequest, opts ...grpc.CallOption) (*RemoveProfileResponse, error)
ListProfiles(ctx context.Context, in *ListProfilesRequest, opts ...grpc.CallOption) (*ListProfilesResponse, error)
GetActiveProfile(ctx context.Context, in *GetActiveProfileRequest, opts ...grpc.CallOption) (*GetActiveProfileResponse, error)
@@ -424,16 +422,6 @@ func (c *daemonServiceClient) AddProfile(ctx context.Context, in *AddProfileRequ
return out, nil
}
func (c *daemonServiceClient) RenameProfile(ctx context.Context, in *RenameProfileRequest, opts ...grpc.CallOption) (*RenameProfileResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(RenameProfileResponse)
err := c.cc.Invoke(ctx, DaemonService_RenameProfile_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
func (c *daemonServiceClient) RemoveProfile(ctx context.Context, in *RemoveProfileRequest, opts ...grpc.CallOption) (*RemoveProfileResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(RemoveProfileResponse)
@@ -625,7 +613,6 @@ type DaemonServiceServer interface {
SwitchProfile(context.Context, *SwitchProfileRequest) (*SwitchProfileResponse, error)
SetConfig(context.Context, *SetConfigRequest) (*SetConfigResponse, error)
AddProfile(context.Context, *AddProfileRequest) (*AddProfileResponse, error)
RenameProfile(context.Context, *RenameProfileRequest) (*RenameProfileResponse, error)
RemoveProfile(context.Context, *RemoveProfileRequest) (*RemoveProfileResponse, error)
ListProfiles(context.Context, *ListProfilesRequest) (*ListProfilesResponse, error)
GetActiveProfile(context.Context, *GetActiveProfileRequest) (*GetActiveProfileResponse, error)
@@ -736,9 +723,6 @@ func (UnimplementedDaemonServiceServer) SetConfig(context.Context, *SetConfigReq
func (UnimplementedDaemonServiceServer) AddProfile(context.Context, *AddProfileRequest) (*AddProfileResponse, error) {
return nil, status.Error(codes.Unimplemented, "method AddProfile not implemented")
}
func (UnimplementedDaemonServiceServer) RenameProfile(context.Context, *RenameProfileRequest) (*RenameProfileResponse, error) {
return nil, status.Error(codes.Unimplemented, "method RenameProfile not implemented")
}
func (UnimplementedDaemonServiceServer) RemoveProfile(context.Context, *RemoveProfileRequest) (*RemoveProfileResponse, error) {
return nil, status.Error(codes.Unimplemented, "method RemoveProfile not implemented")
}
@@ -1253,24 +1237,6 @@ func _DaemonService_AddProfile_Handler(srv interface{}, ctx context.Context, dec
return interceptor(ctx, in, info, handler)
}
func _DaemonService_RenameProfile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(RenameProfileRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(DaemonServiceServer).RenameProfile(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: DaemonService_RenameProfile_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(DaemonServiceServer).RenameProfile(ctx, req.(*RenameProfileRequest))
}
return interceptor(ctx, in, info, handler)
}
func _DaemonService_RemoveProfile_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(RemoveProfileRequest)
if err := dec(in); err != nil {
@@ -1601,10 +1567,6 @@ var DaemonService_ServiceDesc = grpc.ServiceDesc{
MethodName: "AddProfile",
Handler: _DaemonService_AddProfile_Handler,
},
{
MethodName: "RenameProfile",
Handler: _DaemonService_RenameProfile_Handler,
},
{
MethodName: "RemoveProfile",
Handler: _DaemonService_RemoveProfile_Handler,

View File

@@ -79,7 +79,7 @@ func TestPersistLoginOverrides(t *testing.T) {
_, err := profilemanager.UpdateOrCreateConfig(seed)
require.NoError(t, err, "seed config")
activeProf := &profilemanager.ActiveProfileState{ID: "default"}
activeProf := &profilemanager.ActiveProfileState{Name: "default"}
err = persistLoginOverrides(activeProf, tt.newMgmtURL, tt.newPSK)
require.NoError(t, err, "persistLoginOverrides")

View File

@@ -78,7 +78,7 @@ type Server struct {
// changed by connectWithRetryRuns goroutine exit — for that
// (goroutine-still-alive) check, see connectionGoroutineRunning() which
// derives from clientGiveUpChan close state. Protected by s.mutex.
clientRunning bool
clientRunning bool
clientRunningChan chan struct{}
clientGiveUpChan chan struct{} // closed when connectWithRetryRuns goroutine exits
@@ -375,7 +375,7 @@ func (s *Server) SetConfig(callerCtx context.Context, msg *proto.SetConfigReques
return nil, err
}
config, err := s.setConfigInputFromRequest(msg)
config, err := setConfigInputFromRequest(msg)
if err != nil {
return nil, err
}
@@ -398,17 +398,17 @@ func (s *Server) SetConfig(callerCtx context.Context, msg *proto.SetConfigReques
// field is its own optional case. Returns the resolved ConfigInput
// and a non-nil error only when the active profile file path cannot
// be determined.
func (s *Server) setConfigInputFromRequest(msg *proto.SetConfigRequest) (profilemanager.ConfigInput, error) {
func setConfigInputFromRequest(msg *proto.SetConfigRequest) (profilemanager.ConfigInput, error) {
var config profilemanager.ConfigInput
resolved, err := s.resolveProfileHandle(msg.ProfileName, msg.Username)
if err != nil {
log.Errorf("failed to resolve profile %q: %v", msg.ProfileName, err)
return config, err
profState := profilemanager.ActiveProfileState{
Name: msg.ProfileName,
Username: msg.Username,
}
profPath := resolved.Path
if profPath == "" {
profPath = profilemanager.DefaultConfigPath
profPath, err := profState.FilePath()
if err != nil {
log.Errorf("failed to get active profile file path: %v", err)
return config, fmt.Errorf("failed to get active profile file path: %w", err)
}
config.ConfigPath = profPath
@@ -535,9 +535,30 @@ func (s *Server) Login(callerCtx context.Context, msg *proto.LoginRequest) (*pro
}
if msg.ProfileName != nil {
if _, err := s.switchProfileIfNeeded(*msg.ProfileName, msg.Username, activeProf); err != nil {
log.Errorf("failed to switch profile: %v", err)
return nil, err
if *msg.ProfileName != "default" && (msg.Username == nil || *msg.Username == "") {
log.Errorf("profile name is set to %s, but username is not provided", *msg.ProfileName)
return nil, fmt.Errorf("profile name is set to %s, but username is not provided", *msg.ProfileName)
}
var username string
if *msg.ProfileName != "default" {
username = *msg.Username
}
if *msg.ProfileName != activeProf.Name && username != activeProf.Username {
if s.checkProfilesDisabled() {
log.Errorf("profiles are disabled, you cannot use this feature without profiles enabled")
return nil, gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
}
log.Infof("switching to profile %s for user '%s'", *msg.ProfileName, username)
if err := s.profileManager.SetActiveProfileState(&profilemanager.ActiveProfileState{
Name: *msg.ProfileName,
Username: username,
}); err != nil {
log.Errorf("failed to set active profile state: %v", err)
return nil, fmt.Errorf("failed to set active profile state: %w", err)
}
}
}
@@ -547,7 +568,7 @@ func (s *Server) Login(callerCtx context.Context, msg *proto.LoginRequest) (*pro
return nil, fmt.Errorf("failed to get active profile state: %w", err)
}
log.Infof("active profile: %s for %s", activeProf.ID, activeProf.Username)
log.Infof("active profile: %s for %s", activeProf.Name, activeProf.Username)
s.mutex.Lock()
@@ -785,10 +806,10 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
}
if msg != nil && msg.ProfileName != nil {
if _, err := s.switchProfileIfNeeded(*msg.ProfileName, msg.Username, activeProf); err != nil {
if err := s.switchProfileIfNeeded(*msg.ProfileName, msg.Username, activeProf); err != nil {
s.mutex.Unlock()
log.Errorf("failed to switch profile: %v", err)
return nil, err
return nil, fmt.Errorf("failed to switch profile: %w", err)
}
}
@@ -799,7 +820,7 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
return nil, fmt.Errorf("failed to get active profile state: %w", err)
}
log.Infof("active profile: %s for %s", activeProf.ID, activeProf.Username)
log.Infof("active profile: %s for %s", activeProf.Name, activeProf.Username)
config, _, err := s.getConfig(activeProf)
if err != nil {
@@ -843,60 +864,34 @@ func (s *Server) waitForUp(callerCtx context.Context) (*proto.UpResponse, error)
}
}
// resolveProfileHandle resolves a wire-level profile handle (display
// name, ID, or unique ID prefix) to a concrete profile. Returns gRPC
// status errors so handlers can return them directly.
func (s *Server) resolveProfileHandle(handle, username string) (*profilemanager.Profile, error) {
p, err := s.profileManager.ResolveProfile(handle, username)
if err == nil {
return p, nil
}
var amb *profilemanager.ErrAmbiguousHandle
if errors.As(err, &amb) {
return nil, gstatus.Errorf(codes.InvalidArgument, "%v", amb)
}
if errors.Is(err, profilemanager.ErrProfileNotFound) {
return nil, gstatus.Errorf(codes.NotFound, "profile %q not found", handle)
}
return nil, fmt.Errorf("resolve profile: %w", err)
}
// switchProfileIfNeeded resolves the user-supplied handle, updates the
// active profile state if it differs from the current one, and returns
// the resolved profile so callers can include its ID in RPC responses.
func (s *Server) switchProfileIfNeeded(handle string, userName *string, activeProf *profilemanager.ActiveProfileState) (*profilemanager.Profile, error) {
if handle != profilemanager.DefaultProfileName && (userName == nil || *userName == "") {
log.Errorf("profile name is set to %s, but username is not provided", handle)
return nil, fmt.Errorf("profile name is set to %s, but username is not provided", handle)
func (s *Server) switchProfileIfNeeded(profileName string, userName *string, activeProf *profilemanager.ActiveProfileState) error {
if profileName != "default" && (userName == nil || *userName == "") {
log.Errorf("profile name is set to %s, but username is not provided", profileName)
return fmt.Errorf("profile name is set to %s, but username is not provided", profileName)
}
var username string
if handle != profilemanager.DefaultProfileName {
if profileName != "default" {
username = *userName
}
resolved, err := s.resolveProfileHandle(handle, username)
if err != nil {
return nil, err
}
if resolved.ID != activeProf.ID || username != activeProf.Username {
if profileName != activeProf.Name || username != activeProf.Username {
if s.checkProfilesDisabled() {
log.Errorf("profiles are disabled, you cannot use this feature without profiles enabled")
return nil, gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
return gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
}
log.Infof("switching to profile %s (%s) for user %s", resolved.Name, resolved.ID, username)
log.Infof("switching to profile %s for user %s", profileName, username)
if err := s.profileManager.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: resolved.ID,
Name: profileName,
Username: username,
}); err != nil {
log.Errorf("failed to set active profile state: %v", err)
return nil, fmt.Errorf("failed to set active profile state: %w", err)
return fmt.Errorf("failed to set active profile state: %w", err)
}
}
return resolved, nil
return nil
}
// SwitchProfile switches the active profile in the daemon.
@@ -911,9 +906,9 @@ func (s *Server) SwitchProfile(callerCtx context.Context, msg *proto.SwitchProfi
}
if msg != nil && msg.ProfileName != nil {
if _, err := s.switchProfileIfNeeded(*msg.ProfileName, msg.Username, activeProf); err != nil {
if err := s.switchProfileIfNeeded(*msg.ProfileName, msg.Username, activeProf); err != nil {
log.Errorf("failed to switch profile: %v", err)
return nil, err
return nil, fmt.Errorf("failed to switch profile: %w", err)
}
}
activeProf, err = s.profileManager.GetActiveProfileState()
@@ -929,7 +924,7 @@ func (s *Server) SwitchProfile(callerCtx context.Context, msg *proto.SwitchProfi
s.config = config
return &proto.SwitchProfileResponse{Id: activeProf.ID.String()}, nil
return &proto.SwitchProfileResponse{}, nil
}
// Down engine work in the daemon.
@@ -993,10 +988,6 @@ func (s *Server) cleanupConnection() error {
return nil
}
// TODO: consider calling s.connectClient.Stop() instead of engine.Stop().
// actCancel() lets the run loop stop the engine too, so both stop it
// concurrently; ConnectClient.Stop cancels and waits for the run loop,
// making the run loop the sole owner of engine shutdown.
if engine != nil {
if err := engine.Stop(); err != nil {
return err
@@ -1023,27 +1014,22 @@ func (s *Server) Logout(ctx context.Context, msg *proto.LogoutRequest) (*proto.L
}
func (s *Server) handleProfileLogout(ctx context.Context, msg *proto.LogoutRequest) (*proto.LogoutResponse, error) {
if err := s.validateProfileOperation(*msg.ProfileName, true); err != nil {
return nil, err
}
if msg.Username == nil || *msg.Username == "" {
return nil, gstatus.Errorf(codes.InvalidArgument, "username must be provided when profile name is specified")
}
username := *msg.Username
resolved, err := s.resolveProfileHandle(*msg.ProfileName, username)
if err != nil {
return nil, err
}
if err := s.validateProfileOperation(resolved.ID, true); err != nil {
return nil, err
}
if err := s.logoutFromProfile(ctx, resolved); err != nil {
log.Errorf("failed to logout from profile %s: %v", resolved.ID, err)
if err := s.logoutFromProfile(ctx, *msg.ProfileName, username); err != nil {
log.Errorf("failed to logout from profile %s: %v", *msg.ProfileName, err)
return nil, gstatus.Errorf(codes.Internal, "logout: %v", err)
}
activeProf, _ := s.profileManager.GetActiveProfileState()
if activeProf != nil && activeProf.ID == resolved.ID {
if activeProf != nil && activeProf.Name == *msg.ProfileName {
if err := s.cleanupConnection(); err != nil && !errors.Is(err, ErrServiceNotUp) {
log.Errorf("failed to cleanup connection: %v", err)
}
@@ -1105,30 +1091,30 @@ func (s *Server) getConfig(activeProf *profilemanager.ActiveProfileState) (*prof
return config, configExisted, nil
}
func (s *Server) canRemoveProfile(id profilemanager.ID) error {
if id == profilemanager.DefaultProfileName {
func (s *Server) canRemoveProfile(profileName string) error {
if profileName == profilemanager.DefaultProfileName {
return fmt.Errorf("remove profile with reserved name: %s", profilemanager.DefaultProfileName)
}
activeProf, err := s.profileManager.GetActiveProfileState()
if err == nil && activeProf.ID == id {
return fmt.Errorf("remove active profile: %s", id)
if err == nil && activeProf.Name == profileName {
return fmt.Errorf("remove active profile: %s", profileName)
}
return nil
}
func (s *Server) validateProfileOperation(id profilemanager.ID, allowActiveProfile bool) error {
func (s *Server) validateProfileOperation(profileName string, allowActiveProfile bool) error {
if s.checkProfilesDisabled() {
return gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
}
if id == "" {
if profileName == "" {
return gstatus.Errorf(codes.InvalidArgument, "profile name must be provided")
}
if !allowActiveProfile {
if err := s.canRemoveProfile(id); err != nil {
if err := s.canRemoveProfile(profileName); err != nil {
return gstatus.Errorf(codes.InvalidArgument, "%v", err)
}
}
@@ -1136,20 +1122,25 @@ func (s *Server) validateProfileOperation(id profilemanager.ID, allowActiveProfi
return nil
}
func (s *Server) logoutFromProfile(ctx context.Context, profile *profilemanager.Profile) error {
// logoutFromProfile logs out from a specific profile by loading its config and sending logout request
func (s *Server) logoutFromProfile(ctx context.Context, profileName, username string) error {
activeProf, err := s.profileManager.GetActiveProfileState()
if err == nil && activeProf.ID == profile.ID && s.connectClient != nil {
if err == nil && activeProf.Name == profileName && s.connectClient != nil {
return s.sendLogoutRequest(ctx)
}
cfgPath := profile.Path
if cfgPath == "" {
cfgPath = profilemanager.DefaultConfigPath
profileState := &profilemanager.ActiveProfileState{
Name: profileName,
Username: username,
}
profilePath, err := profileState.FilePath()
if err != nil {
return fmt.Errorf("get profile path: %w", err)
}
config, err := profilemanager.GetConfig(cfgPath)
config, err := profilemanager.GetConfig(profilePath)
if err != nil {
return fmt.Errorf("profile '%s' not found", profile.ID)
return fmt.Errorf("profile '%s' not found", profileName)
}
return s.sendLogoutRequestWithConfig(ctx, config)
@@ -1567,14 +1558,15 @@ func (s *Server) GetConfig(ctx context.Context, req *proto.GetConfigRequest) (*p
return nil, ctx.Err()
}
resolved, err := s.resolveProfileHandle(req.ProfileName, req.Username)
if err != nil {
log.Errorf("failed to resolve profile %q: %v", req.ProfileName, err)
return nil, err
prof := profilemanager.ActiveProfileState{
Name: req.ProfileName,
Username: req.Username,
}
cfgPath := resolved.Path
if cfgPath == "" {
cfgPath = profilemanager.DefaultConfigPath
cfgPath, err := prof.FilePath()
if err != nil {
log.Errorf("failed to get active profile file path: %v", err)
return nil, fmt.Errorf("failed to get active profile file path: %w", err)
}
cfg, err := profilemanager.GetConfig(cfgPath)
@@ -1679,39 +1671,12 @@ func (s *Server) AddProfile(ctx context.Context, msg *proto.AddProfileRequest) (
return nil, gstatus.Errorf(codes.InvalidArgument, "profile name and username must be provided")
}
created, err := s.profileManager.AddProfile(msg.ProfileName, msg.Username)
if err != nil {
if err := s.profileManager.AddProfile(msg.ProfileName, msg.Username); err != nil {
log.Errorf("failed to create profile: %v", err)
return nil, fmt.Errorf("failed to create profile: %w", err)
}
return &proto.AddProfileResponse{Id: created.ID.String()}, nil
}
func (s *Server) RenameProfile(ctx context.Context, msg *proto.RenameProfileRequest) (*proto.RenameProfileResponse, error) {
s.mutex.Lock()
defer s.mutex.Unlock()
if s.checkProfilesDisabled() {
return nil, gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
}
if msg.Handle == "" || msg.Username == "" || msg.NewProfileName == "" {
return nil, gstatus.Errorf(codes.InvalidArgument, "profile name, username and new profile name must be provided")
}
resolved, err := s.resolveProfileHandle(msg.Handle, msg.Username)
if err != nil {
return nil, err
}
err = s.profileManager.RenameProfile(resolved.ID, msg.Username, msg.NewProfileName)
if err != nil {
log.Errorf("failed to rename profile: %v", err)
return nil, fmt.Errorf("failed to rename profile: %w", err)
}
return &proto.RenameProfileResponse{OldProfileName: resolved.Name}, nil
return &proto.AddProfileResponse{}, nil
}
// RemoveProfile removes a profile from the daemon.
@@ -1719,29 +1684,20 @@ func (s *Server) RemoveProfile(ctx context.Context, msg *proto.RemoveProfileRequ
s.mutex.Lock()
defer s.mutex.Unlock()
if s.checkProfilesDisabled() {
return nil, gstatus.Errorf(codes.Unavailable, errProfilesDisabled)
}
if msg.ProfileName == "" {
return nil, gstatus.Errorf(codes.InvalidArgument, "profile name must be provided")
}
resolved, err := s.resolveProfileHandle(msg.ProfileName, msg.Username)
if err != nil {
if err := s.validateProfileOperation(msg.ProfileName, false); err != nil {
return nil, err
}
if err := s.logoutFromProfile(ctx, resolved); err != nil {
log.Warnf("failed to logout from profile %s before removal: %v", resolved.ID, err)
if err := s.logoutFromProfile(ctx, msg.ProfileName, msg.Username); err != nil {
log.Warnf("failed to logout from profile %s before removal: %v", msg.ProfileName, err)
}
if err := s.profileManager.RemoveProfile(resolved.ID, msg.Username); err != nil {
if err := s.profileManager.RemoveProfile(msg.ProfileName, msg.Username); err != nil {
log.Errorf("failed to remove profile: %v", err)
return nil, fmt.Errorf("failed to remove profile: %w", err)
}
return &proto.RemoveProfileResponse{Id: resolved.ID.String()}, nil
return &proto.RemoveProfileResponse{}, nil
}
// ListProfiles lists all profiles in the daemon.
@@ -1764,7 +1720,6 @@ func (s *Server) ListProfiles(ctx context.Context, msg *proto.ListProfilesReques
}
for i, profile := range profiles {
response.Profiles[i] = &proto.Profile{
Id: profile.ID.String(),
Name: profile.Name,
IsActive: profile.IsActive,
}
@@ -1773,9 +1728,7 @@ func (s *Server) ListProfiles(ctx context.Context, msg *proto.ListProfilesReques
return response, nil
}
// GetActiveProfile returns the active profile in the daemon. The ProfileName
// field carries the display name for backwards compatibility with UI clients,
// new callers should prefer Id.
// GetActiveProfile returns the active profile in the daemon.
func (s *Server) GetActiveProfile(ctx context.Context, msg *proto.GetActiveProfileRequest) (*proto.GetActiveProfileResponse, error) {
s.mutex.Lock()
defer s.mutex.Unlock()
@@ -1786,23 +1739,9 @@ func (s *Server) GetActiveProfile(ctx context.Context, msg *proto.GetActiveProfi
return nil, fmt.Errorf("failed to get active profile state: %w", err)
}
// Fallback to legacy name == ID
displayName := activeProfile.ID.String()
if activeProfile.ID != profilemanager.DefaultProfileName {
if profiles, lerr := s.profileManager.ListProfiles(activeProfile.Username); lerr == nil {
for _, p := range profiles {
if p.ID == activeProfile.ID {
displayName = p.Name
break
}
}
}
}
return &proto.GetActiveProfileResponse{
ProfileName: displayName,
ProfileName: activeProfile.Name,
Username: activeProfile.Username,
Id: activeProfile.ID.String(),
}, nil
}

View File

@@ -97,7 +97,7 @@ func TestConnectWithRetryRuns(t *testing.T) {
pm := profilemanager.ServiceManager{}
err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: "test-profile",
Name: "test-profile",
Username: currUser.Username,
})
if err != nil {
@@ -158,7 +158,7 @@ func TestServer_Up(t *testing.T) {
pm := profilemanager.ServiceManager{}
err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: profilemanager.ID(profName),
Name: profName,
Username: currUser.Username,
})
if err != nil {
@@ -228,7 +228,7 @@ func TestServer_SubcribeEvents(t *testing.T) {
pm := profilemanager.ServiceManager{}
err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: "default",
Name: "default",
Username: currUser.Username,
})
if err != nil {

View File

@@ -62,7 +62,7 @@ func setupServerWithProfile(t *testing.T) (s *Server, ctx context.Context, profN
pm := profilemanager.ServiceManager{}
require.NoError(t, pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: profilemanager.ID(profName),
Name: profName,
Username: currUser.Username,
}))
@@ -107,9 +107,9 @@ func TestSetConfig_MDMReject_SingleField(t *testing.T) {
func TestSetConfig_MDMReject_MultipleFields(t *testing.T) {
withMDMPolicy(t, mdm.NewPolicy(map[string]any{
mdm.KeyManagementURL: "https://mdm.example.com:443",
mdm.KeyBlockInbound: true,
mdm.KeyRosenpassEnabled: true,
mdm.KeyManagementURL: "https://mdm.example.com:443",
mdm.KeyBlockInbound: true,
mdm.KeyRosenpassEnabled: true,
}))
s, ctx, profName, username, _ := setupServerWithProfile(t)

View File

@@ -47,7 +47,7 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
pm := profilemanager.ServiceManager{}
err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
ID: profilemanager.ID(profName),
Name: profName,
Username: currUser.Username,
})
require.NoError(t, err)
@@ -96,7 +96,7 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
DisableNotifications: &disableNotifications,
LazyConnectionEnabled: &lazyConnectionEnabled,
BlockInbound: &blockInbound,
DisableIpv6: &disableIPv6,
DisableIpv6: &disableIPv6,
NatExternalIPs: []string{"1.2.3.4", "5.6.7.8"},
CleanNATExternalIPs: false,
CustomDNSAddress: []byte("1.1.1.1:53"),
@@ -112,7 +112,7 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
require.NoError(t, err)
profState := profilemanager.ActiveProfileState{
ID: profilemanager.ID(profName),
Name: profName,
Username: currUser.Username,
}
cfgPath, err := profState.FilePath()

View File

@@ -98,7 +98,6 @@ type RelayStateOutputDetail struct {
URI string `json:"uri" yaml:"uri"`
Available bool `json:"available" yaml:"available"`
Error string `json:"error" yaml:"error"`
Transport string `json:"transport,omitempty" yaml:"transport,omitempty"`
}
type RelayStateOutput struct {
@@ -220,8 +219,7 @@ func mapRelays(relays []*proto.RelayState) RelayStateOutput {
RelayStateOutputDetail{
URI: relay.URI,
Available: available,
Error: relayErrorString(relay.GetError()),
Transport: relay.GetTransport(),
Error: relay.GetError(),
},
)
@@ -237,12 +235,6 @@ func mapRelays(relays []*proto.RelayState) RelayStateOutput {
}
}
// relayErrorString flattens a newline-joined aggregated relay error onto a
// single line for status output.
func relayErrorString(s string) string {
return strings.ReplaceAll(s, "\n", "; ")
}
func mapNSGroups(servers []*proto.NSGroupState) []NsServerGroupStateOutput {
mappedNSGroups := make([]NsServerGroupStateOutput, 0, len(servers))
for _, pbNsGroupServer := range servers {
@@ -449,8 +441,6 @@ func (o *OutputOverview) GeneralSummary(showURL bool, showRelays bool, showNameS
available = "Unavailable"
reason = fmt.Sprintf(", reason: %s", relay.Error)
}
} else if relay.Transport != "" {
available = fmt.Sprintf("%s via %s", available, relay.Transport)
}
relaysString += fmt.Sprintf("\n [%s] is %s%s", relay.URI, available, reason)

View File

@@ -647,13 +647,3 @@ func TestTimeAgo(t *testing.T) {
})
}
}
func TestMapRelaysTransport(t *testing.T) {
out := mapRelays([]*proto.RelayState{
{URI: "rels://relay.example:443", Available: true, Transport: "quic"},
{URI: "rels://relay2.example:443", Available: true, Transport: "ws"},
})
require.Len(t, out.Details, 2)
assert.Equal(t, "quic", out.Details[0].Transport)
assert.Equal(t, "ws", out.Details[1].Transport)
}

View File

@@ -418,14 +418,7 @@ func newServiceClient(args *newServiceClientArgs) *serviceClient {
case args.showProfiles:
s.showProfilesUI()
case args.showQuickActions:
// Suppress the on-boot Quick Actions popup when the daemon
// reports DisableAutoConnect=true — that flag carries both the
// user's "Connect on Startup = off" preference AND any MDM-
// enforced override (applyMDMPolicy writes the policy value
// into the same Config field). See netbirdio/netbird#5744.
if !s.disableAutoConnectFromDaemon() {
s.showQuickActionsUI()
}
s.showQuickActionsUI()
case args.showUpdate:
s.showUpdateProgress(ctx, args.showUpdateVersion)
}
@@ -652,7 +645,7 @@ func (s *serviceClient) buildSetConfigRequest(iMngURL string, port, mtu int64) (
}
req := &proto.SetConfigRequest{
ProfileName: activeProf.ID.String(),
ProfileName: activeProf.Name,
Username: currUser.Username,
}
@@ -825,15 +818,13 @@ func (s *serviceClient) login(ctx context.Context, openURL bool) (*proto.LoginRe
return nil, fmt.Errorf("get current user: %w", err)
}
handle := activeProf.ID.String()
loginReq := &proto.LoginRequest{
IsUnixDesktopClient: runtime.GOOS == "linux" || runtime.GOOS == "freebsd",
ProfileName: &handle,
ProfileName: &activeProf.Name,
Username: &currUser.Username,
}
profileState, err := s.profileManager.GetProfileState(activeProf.ID)
profileState, err := s.profileManager.GetProfileState(activeProf.Name)
if err != nil {
log.Debugf("failed to get profile state for login hint: %v", err)
} else if profileState.Email != "" {
@@ -1345,40 +1336,6 @@ func (s *serviceClient) getFeatures() (*proto.GetFeaturesResponse, error) {
return features, nil
}
// disableAutoConnectFromDaemon returns true when the daemon reports
// the active profile has DisableAutoConnect=true. Used by the
// --quick-actions startup path to suppress the on-boot popup when the
// user (or an MDM admin) opted out of auto-connecting; both cases
// converge on the same Config field because applyMDMPolicy writes the
// policy value into it. Returns false on any RPC / lookup failure so a
// daemon hiccup does not silently swallow the popup.
func (s *serviceClient) disableAutoConnectFromDaemon() bool {
activeProf, err := s.profileManager.GetActiveProfile()
if err != nil {
log.Warnf("disableAutoConnectFromDaemon: get active profile: %v", err)
return false
}
currUser, err := user.Current()
if err != nil {
log.Warnf("disableAutoConnectFromDaemon: get current user: %v", err)
return false
}
conn, err := s.getSrvClient(failFastTimeout)
if err != nil {
log.Warnf("disableAutoConnectFromDaemon: get daemon client: %v", err)
return false
}
srvCfg, err := conn.GetConfig(s.ctx, &proto.GetConfigRequest{
ProfileName: activeProf.ID.String(),
Username: currUser.Username,
})
if err != nil {
log.Warnf("disableAutoConnectFromDaemon: GetConfig RPC: %v", err)
return false
}
return srvCfg.GetDisableAutoConnect()
}
// getSrvConfig from the service to show it in the settings window.
func (s *serviceClient) getSrvConfig() {
s.managementURL = profilemanager.DefaultManagementURL
@@ -1410,7 +1367,7 @@ func (s *serviceClient) getSrvConfig() {
}
srvCfg, err := conn.GetConfig(s.ctx, &proto.GetConfigRequest{
ProfileName: activeProf.ID.String(),
ProfileName: activeProf.Name,
Username: currUser.Username,
})
if err != nil {
@@ -1656,7 +1613,7 @@ func (s *serviceClient) loadSettings() {
}
cfg, err := conn.GetConfig(s.ctx, &proto.GetConfigRequest{
ProfileName: activeProf.ID.String(),
ProfileName: activeProf.Name,
Username: currUser.Username,
})
if err != nil {
@@ -1856,7 +1813,7 @@ func (s *serviceClient) updateConfig() error {
}
req := proto.SetConfigRequest{
ProfileName: activeProf.ID.String(),
ProfileName: activeProf.Name,
Username: currUser.Username,
DisableAutoConnect: &disableAutoStart,
ServerSSHAllowed: &sshAllowed,

View File

@@ -66,7 +66,7 @@ func (s *serviceClient) showProfilesUI() {
} else {
indicator.SetText("")
}
nameLabel.SetText(formatProfileLabel(profile, profiles))
nameLabel.SetText(profile.Name)
// Configure Select/Active button
selectBtn.SetText(func() string {
@@ -88,7 +88,7 @@ func (s *serviceClient) showProfilesUI() {
return
}
// switch
err = s.switchProfile(profile.ID)
err = s.switchProfile(profile.Name)
if err != nil {
log.Errorf("failed to switch profile: %v", err)
dialog.ShowError(errors.New("failed to select profile"), s.wProfiles)
@@ -130,7 +130,7 @@ func (s *serviceClient) showProfilesUI() {
logoutBtn.Show()
logoutBtn.SetText("Deregister")
logoutBtn.OnTapped = func() {
s.handleProfileLogout(profile, refresh)
s.handleProfileLogout(profile.Name, refresh)
}
// Remove profile
@@ -144,7 +144,7 @@ func (s *serviceClient) showProfilesUI() {
return
}
err = s.removeProfile(profile.ID)
err = s.removeProfile(profile.Name)
if err != nil {
log.Errorf("failed to remove profile: %v", err)
dialog.ShowError(fmt.Errorf("failed to remove profile"), s.wProfiles)
@@ -250,7 +250,7 @@ func (s *serviceClient) addProfile(profileName string) error {
return nil
}
func (s *serviceClient) switchProfile(handle string) error {
func (s *serviceClient) switchProfile(profileName string) error {
conn, err := s.getSrvClient(defaultFailTimeout)
if err != nil {
return fmt.Errorf(getClientFMT, err)
@@ -261,15 +261,15 @@ func (s *serviceClient) switchProfile(handle string) error {
return fmt.Errorf("get current user: %w", err)
}
resp, err := conn.SwitchProfile(s.ctx, &proto.SwitchProfileRequest{
ProfileName: &handle,
if _, err := conn.SwitchProfile(s.ctx, &proto.SwitchProfileRequest{
ProfileName: &profileName,
Username: &currUser.Username,
})
if err != nil {
}); err != nil {
return fmt.Errorf("switch profile failed: %w", err)
}
if err := s.profileManager.SwitchProfile(profilemanager.ID(resp.Id)); err != nil {
err = s.profileManager.SwitchProfile(profileName)
if err != nil {
return fmt.Errorf("switch profile: %w", err)
}
@@ -299,27 +299,10 @@ func (s *serviceClient) removeProfile(profileName string) error {
}
type Profile struct {
ID string
Name string
IsActive bool
}
// formatProfileLabel returns the display label for a profile. Profiles can
// share the same Name, so when more than one profile in profiles carries this
// Name, a short form of the ID is appended to disambiguate the entries.
func formatProfileLabel(profile Profile, profiles []Profile) string {
count := 0
for _, p := range profiles {
if p.Name == profile.Name {
count++
}
}
if count <= 1 {
return profile.Name
}
return fmt.Sprintf("%s (%s)", profile.Name, profilemanager.ID(profile.ID).ShortID())
}
func (s *serviceClient) getProfiles() ([]Profile, error) {
conn, err := s.getSrvClient(defaultFailTimeout)
if err != nil {
@@ -341,7 +324,6 @@ func (s *serviceClient) getProfiles() ([]Profile, error) {
for _, profile := range profilesResp.Profiles {
profiles = append(profiles, Profile{
ID: profile.Id,
Name: profile.Name,
IsActive: profile.IsActive,
})
@@ -350,10 +332,10 @@ func (s *serviceClient) getProfiles() ([]Profile, error) {
return profiles, nil
}
func (s *serviceClient) handleProfileLogout(profile Profile, refreshCallback func()) {
func (s *serviceClient) handleProfileLogout(profileName string, refreshCallback func()) {
dialog.ShowConfirm(
"Deregister",
fmt.Sprintf("Are you sure you want to deregister from '%s'?", profile.Name),
fmt.Sprintf("Are you sure you want to deregister from '%s'?", profileName),
func(confirm bool) {
if !confirm {
return
@@ -374,10 +356,8 @@ func (s *serviceClient) handleProfileLogout(profile Profile, refreshCallback fun
}
username := currUser.Username
// ProfileName is treated as a handle; send the ID so the
// daemon resolves to exactly this profile.
_, err = conn.Logout(s.ctx, &proto.LogoutRequest{
ProfileName: &profile.ID,
ProfileName: &profileName,
Username: &username,
})
if err != nil {
@@ -388,7 +368,7 @@ func (s *serviceClient) handleProfileLogout(profile Profile, refreshCallback fun
dialog.ShowInformation(
"Deregistered",
fmt.Sprintf("Successfully deregistered from '%s'", profile.Name),
fmt.Sprintf("Successfully deregistered from '%s'", profileName),
s.wProfiles,
)
@@ -481,7 +461,6 @@ func (p *profileMenu) getProfiles() ([]Profile, error) {
for _, profile := range profilesResp.Profiles {
profiles = append(profiles, Profile{
ID: profile.Id,
Name: profile.Name,
IsActive: profile.IsActive,
})
@@ -522,7 +501,7 @@ func (p *profileMenu) refresh() {
}
if activeProf.ProfileName == "default" || activeProf.Username == currUser.Username {
activeProfState, err := p.profileManager.GetProfileState(profilemanager.ID(activeProf.Id))
activeProfState, err := p.profileManager.GetProfileState(activeProf.ProfileName)
if err != nil {
log.Warnf("failed to get active profile state: %v", err)
p.emailMenuItem.Hide()
@@ -533,7 +512,7 @@ func (p *profileMenu) refresh() {
}
for _, profile := range profiles {
item := p.profileMenuItem.AddSubMenuItem(formatProfileLabel(profile, profiles), "")
item := p.profileMenuItem.AddSubMenuItem(profile.Name, "")
if profile.IsActive {
item.Check()
}
@@ -562,8 +541,8 @@ func (p *profileMenu) refresh() {
return
}
switchResp, err := conn.SwitchProfile(ctx, &proto.SwitchProfileRequest{
ProfileName: &profile.ID,
_, err = conn.SwitchProfile(ctx, &proto.SwitchProfileRequest{
ProfileName: &profile.Name,
Username: &currUser.Username,
})
if err != nil {
@@ -573,7 +552,7 @@ func (p *profileMenu) refresh() {
return
}
err = p.profileManager.SwitchProfile(profilemanager.ID(switchResp.Id))
err = p.profileManager.SwitchProfile(profile.Name)
if err != nil {
log.Errorf("failed to switch profile '%s': %v", profile.Name, err)
return
@@ -748,10 +727,7 @@ func (p *profileMenu) updateMenu() {
}
sort.Slice(profiles, func(i, j int) bool {
if profiles[i].Name != profiles[j].Name {
return profiles[i].Name < profiles[j].Name
}
return profiles[i].ID < profiles[j].ID
return profiles[i].Name < profiles[j].Name
})
p.mu.Lock()

View File

@@ -2,5 +2,4 @@ FROM ubuntu:24.04
RUN apt update && apt install -y ca-certificates && rm -fr /var/cache/apt
ENTRYPOINT [ "/go/bin/netbird-server" ]
CMD ["--config", "/etc/netbird/config.yaml"]
ARG TARGETPLATFORM
COPY ${TARGETPLATFORM}/netbird-server /go/bin/netbird-server
COPY netbird-server /go/bin/netbird-server

View File

@@ -1,109 +0,0 @@
# Agent Networks — overview
Single-entry point. Feature scope, the module map, and the cross-cutting
topics worth keeping in mind, with links into every per-module guide.
## TL;DR
Agent Networks introduces an **LLM-aware reverse-proxy middleware system**
plus **account-level controls** (budget rules, log collection toggles,
PII redaction). The management server synthesises a per-peer middleware
chain that the proxy executes on every LLM request; the chain enforces
quotas, injects identity, redacts PII, parses tokens/cost, and emits
access-log entries. The dashboard exposes the surface as a single **AI
Observability** page with four tabs.
- **Backend** lives in this repo, primarily under
`management/server/agentnetwork`, `proxy/internal/middleware`, and
`proxy/internal/llm`, with wire contracts in `shared/management`.
- **Dashboard** lives in the dashboard repo under
`src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`.
## Reading order
| # | Doc | Why |
|---|-----|-----|
| 1 | [01-end-to-end-flows.md](01-end-to-end-flows.md) | Get the three big diagrams in your head first. |
| 2 | [modules/10-shared-api.md](modules/10-shared-api.md) | Wire contracts — every other module either produces or consumes these. |
| 3 | [modules/21-management-agentnetwork.md](modules/21-management-agentnetwork.md) | The largest module; everything the proxy executes originates here. |
| 4 | [modules/30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md) | The generic plugin system on the proxy side. |
| 5 | [modules/31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md) | The 8 LLM middlewares that ride on the framework. |
| 6 | Everything else in any order. | |
## Module map
11 modules. Each is described in detail in its own file under
[`modules/`](modules/).
| # | Module | Risk | BC impact |
|---|--------|------|-----------|
| 10 | [shared/api](modules/10-shared-api.md) — proto + OpenAPI | Low | Additive only |
| 20 | [management/store](modules/20-management-store.md) — SQL persistence | Medium | Auto-migrate (additive) |
| 21 | [management/agentnetwork](modules/21-management-agentnetwork.md) — domain layer + synthesizer | **High** | Additive |
| 22 | [management/handlers + wiring](modules/22-management-handlers-wiring.md) — HTTP API + gRPC delivery | Medium | Additive |
| 30 | [proxy/middleware-framework](modules/30-proxy-middleware-framework.md) — generic plugin system | High | Additive |
| 31 | [proxy/middleware-builtin](modules/31-proxy-middleware-builtin.md) — 8 LLM middlewares | High | Additive |
| 32 | [proxy/llm-parsers](modules/32-proxy-llm-parsers.md) — SDK adapters + pricing | Medium | Additive |
| 33 | [proxy/runtime](modules/33-proxy-runtime.md) — translate + serve + access-log | High | Additive (touches hot path) |
| 40 | [dashboard](modules/40-dashboard.md) — UI for everything above | Medium | Sidebar reshape |
| 50 | [path-routed-providers](modules/50-path-routed-providers.md) — Vertex AI + Bedrock | Medium | Additive (new catalog entries) |
The largest and highest-risk module is `management/agentnetwork`: it is
the single writer of the middleware chain the proxy executes.
## Cross-cutting topics
These are the items most likely to bite production. Each is fully
documented in the linked module guide.
1. **Capture-pointer semantics** (`*bool` for `capture_prompt` and
`capture_completion`): nil = legacy emit, false = suppress, true =
emit. nil-vs-false must be handled at every JSON hop. See
[21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
and [31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
2. **`ProxyMapping.Private` preservation** on per-proxy live updates.
Failure mode: `auth` skips `ValidateTunnelPeer`
`CapturedData.UserGroups` empty → `llm_router` denies. See
[33-proxy-runtime.md](modules/33-proxy-runtime.md).
3. **respInput carrying `UserEmail`/`UserGroups`/`UserGroupNames` onto
the response leg** in `reverseproxy.go`. Load-bearing wire that lets
`llm_limit_record` ship non-empty `group_ids` on `RecordLLMUsage`. See
[33-proxy-runtime.md](modules/33-proxy-runtime.md).
4. **Min-wins all-must-pass budget rule semantics**. Every matching
rule's remaining quota must be > 0 for the request to proceed; one
exhausted rule blocks the whole call. Documented in
[21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
and the `llm_limit_check` middleware in
[31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
5. **body-tap memory bounds**: per-direction 1 MiB cap, shared 256 MiB
budget, `LimitReader(r.Body, limit+1)` for truncation detection with
`replayReadCloser` fallback so upstream still sees the full body.
`cloneInputFor` deep-copies the body up to 16 times per chain — a
perf hot-spot. See
[30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
6. **UpstreamRewrite.AuthHeader bypasses the header denylist**
deliberately. The runtime consumer only unpacks it via the
trusted upstream-build path. See
[30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
7. **`disable_access_log` default-false semantics**: the synth target
sets it true, all other targets leave it false. See
[10-shared-api.md](modules/10-shared-api.md).
8. **String-typed `decision` / `deny_code`** on
`CheckLLMPolicyLimitsResponse` — would benefit from enum pinning
before external consumers integrate. See
[10-shared-api.md](modules/10-shared-api.md).
## Explicit non-goals
- **Reaper / GC pass over stale synth services** — designed but cut from
scope.
- **URL-sync for tab state on AI Observability** — read path is wired
(`?tab=`) but write path isn't. Future work.
- **CI golden-file regen-and-diff for `types.gen.go` /
`proxy_service.pb.go`** — would catch codegen drift; not yet in place.
## Where to read the code
Per-module file scopes are listed in each module guide. Behaviour is
covered by Go tests co-located with each package (and an end-to-end
chain integration test under `proxy/internal/proxy`).

View File

@@ -1,217 +0,0 @@
# End-to-end flows
Three cross-module mermaid diagrams. Each per-module guide repeats the
slice that's relevant to its own scope — these are the canonical
top-down views.
- [Flow A — Config → runtime (synth + deliver)](#flow-a--config--runtime-synth--deliver)
- [Flow B — Request lifecycle through the LLM chain](#flow-b--request-lifecycle-through-the-llm-chain)
- [Flow C — Budget rule feedback loop](#flow-c--budget-rule-feedback-loop)
---
## Flow A — Config → runtime (synth + deliver)
How an operator's change to a Provider, Policy, Guardrail, Budget Rule,
or Settings record ends up as live middleware on a peer's proxy.
```mermaid
sequenceDiagram
autonumber
actor Op as Operator
participant UI as Dashboard
participant HTTP as management/handlers
participant Mgr as agentnetwork.Manager
participant Store as management/store (SQL)
participant Ctl as network_map.Controller
participant Synth as agentnetwork.SynthesizeServices
participant Grpc as management gRPC
participant Proxy as netbird-proxy
participant Xlate as middleware_translate
participant Chain as middleware.Chain
Op->>UI: edit provider/policy/budget/settings
UI->>HTTP: REST PUT/POST /api/agent-network/*
HTTP->>Mgr: SaveProvider / SavePolicy / SaveBudgetRule / SaveSettings
Mgr->>Store: persist (gorm)
Mgr-->>Ctl: account change event (Network-Map dirty)
loop per connected peer
Ctl->>Synth: SynthesizeServices(ctx, store, accountID)
Synth->>Store: load providers, policies, guardrails, budget rules, settings
Synth-->>Synth: build per-peer Service list
Note over Synth: each Service has a middleware<br/>chain with capture_prompt /<br/>capture_completion / redact_pii<br/>baked from account settings
Synth-->>Ctl: []rpservice.Service
Ctl->>Grpc: NetworkMap push (services + middleware configs)
end
Grpc-->>Proxy: NetworkMap stream
Proxy->>Xlate: translate proto MiddlewareConfig → runtime Spec
Xlate->>Chain: register / replace per-service chain
Note over Chain: chain replacement is live<br/>(no proxy restart, in-flight<br/>requests unaffected)
```
**Notes on the diagram**
- The `network_map.Controller` synthesises on every push, not on a
timer. A single config change costs O(connected peers × policies ×
providers) per push. See [`modules/22-management-handlers-wiring.md`](modules/22-management-handlers-wiring.md).
- `SynthesizeServices` is the single source of truth for the wire
format the proxy executes. Anything the proxy does that the
synthesiser didn't request is a bug. See
[`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
- The translate step (step 13) is the only place that knows the
middleware-ID strings on the proxy side. It must reject unknown IDs;
silently dropping middlewares would create a security gap (e.g.
missing `llm_limit_check` ⇒ unbounded spend). See
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
---
## Flow B — Request lifecycle through the LLM chain
What happens when an agent on the client peer sends a chat-completion /
messages request through the synthesised reverse-proxy.
```mermaid
sequenceDiagram
autonumber
actor Agent as Agent (local)
participant Px as netbird-proxy
participant Auth as auth middleware
participant Map as service-mapping
participant Req as llm_request_parser
participant Rt as llm_router
participant Chk as llm_limit_check
participant Inj as llm_identity_inject
participant Grd as llm_guardrail
participant Up as upstream LLM
participant Resp as llm_response_parser
participant Cost as cost_meter
participant Rec as llm_limit_record
participant Log as access-log
participant MgmtGrpc as management gRPC
Agent->>Px: POST /v1/chat/completions (OpenAI / Anthropic)
Px->>Auth: identify peer (user, groups)
Auth->>Map: resolve service from Host + path
Map-->>Req: dispatch chain in slot order
Req->>Req: parse body → provider, model, prompt, token estimate
Note over Req: capture_prompt gates raw_prompt<br/>capture (nil = legacy emit,<br/>false = drop, true = emit)
Req->>Rt: pass metadata
Rt->>Chk: route to upstream candidate
Chk->>MgmtGrpc: CheckLLMPolicyLimits(provider, model, est_tokens, groups, user)
MgmtGrpc-->>Chk: decision = allow / deny + deny_code
alt decision == deny
Chk-->>Log: emit access-log with deny_code<br/>(if EnableLogCollection)
Chk-->>Agent: 429 (or 403 per deny_code)
else decision == allow
Chk->>Inj: continue
Inj->>Inj: inject NetBird identity headers per provider config
Inj->>Grd: continue
Grd->>Grd: enforce model allowlist
Grd->>Up: forward (over WireGuard)
Up-->>Resp: response (JSON or SSE stream)
Resp->>Resp: parse usage tokens, completion
Note over Resp: capture_completion gates raw<br/>completion capture
Resp->>Cost: tokens
Cost->>Cost: lookup pricing.yaml + compute cost
Cost->>Rec: tokens + cost
Rec->>MgmtGrpc: RecordLLMUsage(provider, model, prompt_t, completion_t, cost, groups, user)
Rec-->>Log: emit access-log entry<br/>(if EnableLogCollection)
Log-->>Agent: 200 + body (streamed if SSE)
end
```
**Notes on the diagram**
- The chain runs in synth-defined order. Re-ordering middlewares
changes invariants — `llm_limit_check` must precede `llm_router` so
a denied request never hits upstream, and `llm_limit_record` must
pair with `llm_limit_check` so a successful check is always recorded
(or the rate-limit semantics break). See
[`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
- `llm_guardrail` is also where PII redaction happens
(`redact_pii = settings.RedactPii`). Phones, emails, credit cards,
PII names — see `redact.go` for the full set. See
[`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
- SSE streaming requires special handling on the response side; the
parser must handle partial chunks without buffering the whole
stream. See [`modules/32-proxy-llm-parsers.md`](modules/32-proxy-llm-parsers.md).
- Access-log emission is gated on `settings.EnableLogCollection`. With
it OFF, neither the deny nor the allow leg writes an entry — the
chain still runs (budget rules are still enforced) but no audit trail
is kept. See
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
---
## Flow C — Budget rule feedback loop
How an account's budget rules tighten ceilings on every request and how
consumption flows back into the dashboard.
```mermaid
flowchart LR
subgraph Operator
DashBud[Dashboard Budget Settings tab]
end
subgraph Mgmt[Management]
Save[POST/PUT /api/agent-network/budget-rules]
Store[(SQL store)]
Synth[SynthesizeServices]
Check[CheckLLMPolicyLimits RPC]
Rec[RecordLLMUsage RPC]
Cons[/api/agent-network/consumption]
end
subgraph Proxy[Proxy]
Chk[llm_limit_check]
RecMw[llm_limit_record]
end
subgraph DashView[Dashboard Budget Dashboard tab]
Panel[AgentConsumptionPanel]
end
DashBud -->|create / update rules| Save
Save --> Store
Store --> Synth
Synth -->|push synth-services to peer| Proxy
Chk -->|per request| Check
Check -->|aggregate matching rules<br/>min-wins all-must-pass| Store
Check -->|allow / deny| Chk
RecMw -->|post-response| Rec
Rec -->|tokens + cost + groups + user| Store
Store -->|read counters| Cons
Cons --> Panel
```
**Notes on the diagram**
- **min-wins all-must-pass** is the core semantic. A budget rule binds
to (group set, user set) with a (window, ceiling). At check time,
every rule that matches the caller is evaluated; if ANY rule has
zero remaining quota the request is denied. This is the most
surprising semantic for operators — see the invariants section of
[`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
- The proxy never makes its own budget decisions. It always asks
management via `CheckLLMPolicyLimits` and reports back via
`RecordLLMUsage`. This keeps account-wide accounting in one place
and avoids per-proxy drift.
- `RecordLLMUsage` must carry `group_ids` and `user_id` so the
decrement hits the right rule(s). The wire that carries those
fields onto the response leg is `respInput` in `reverseproxy.go`. See
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
- The dashboard's Budget Dashboard tab polls
`/api/agent-network/consumption` — not gRPC, not WebSocket. Poll
interval lives in `AgentConsumptionPanel.tsx`. See
[`modules/40-dashboard.md`](modules/40-dashboard.md).
---
## Cross-references
- Per-module guides: [`modules/`](modules/)
- Overview + module map: [`00-overview.md`](00-overview.md)

View File

@@ -1,66 +0,0 @@
# Agent Networks — architecture documentation
A self-contained set of documents describing the agent-networks feature:
an LLM-aware reverse-proxy middleware system plus account-level controls
(budget rules, log collection toggles, PII redaction). The management
server synthesises a per-peer middleware chain that the proxy executes on
every LLM request.
## What to read first
1. **[00-overview.md](00-overview.md)** — the single entry point. Feature
scope, the module map, and the cross-cutting topics worth keeping in
mind, with links to every per-module guide.
2. **[01-end-to-end-flows.md](01-end-to-end-flows.md)** — three
high-level mermaid diagrams: config-to-runtime synth/delivery,
per-request lifecycle through the LLM chain, and the budget-rule
feedback loop.
3. **Per-module guides** under `modules/` — one file per package. Each
describes the module boundary, the file-level layout, its own flow
diagrams, the public contracts, the invariants it relies on, and the
areas worth the closest attention.
## Directory layout
```
docs/agent-networks/
├── README.md # you are here
├── 00-overview.md # feature summary + module map
├── 01-end-to-end-flows.md # cross-module mermaid diagrams
└── modules/
├── 10-shared-api.md # proto + OpenAPI wire contracts
├── 20-management-store.md # SQL persistence layer
├── 21-management-agentnetwork.md # domain layer + synthesizer (largest)
├── 22-management-handlers-wiring.md # HTTP API + gRPC delivery
├── 30-proxy-middleware-framework.md # generic plugin system
├── 31-proxy-middleware-builtin.md # 8 LLM-aware middlewares
├── 32-proxy-llm-parsers.md # OpenAI/Anthropic/Bedrock SDKs + pricing
├── 33-proxy-runtime.md # translate + serve + access-log
├── 40-dashboard.md # UI for everything above (lives in the dashboard repo)
└── 50-path-routed-providers.md # Vertex AI + Bedrock (path-routed, keyfile:: creds, /bedrock prefix)
```
The `40-dashboard.md` module documents code that lives in the **dashboard
repo**, not in this repo. The guide is co-located here so backend readers
see the full picture in one place.
## How the per-module guides are structured
Every `modules/*.md` follows the same template so the docs are easy to
scan:
- **Module boundary** — what this package owns; where it sits in the stack.
- **Files** — path / role.
- **Architecture & flow** — one or more mermaid diagrams.
- **Public contracts** — function signatures, gRPC messages, JSON shapes.
- **Invariants** — semantic guarantees the module relies on or enforces.
- **Things to scrutinize** — split by correctness / security /
concurrency / backward-compat / performance / observability.
- **Test coverage** — the test files that lock down behaviour in this
module.
- **Known limitations / non-goals** — what is intentionally out of scope.
- **Cross-references** — upstream/downstream module links + the
end-to-end flow + the overview.
See [00-overview.md](00-overview.md) for the module map and the
cross-cutting topics.

View File

@@ -1,105 +0,0 @@
# shared/api — wire contracts (proto + OpenAPI)
> **Risk level:** Medium — wire-format surface that every other module pins against; backward-compat hinges on field-number discipline more than on logic correctness.
> **Backward-compat impact:** Additive only (new proto fields use unallocated numbers, new RPCs default to `Unimplemented`, new OpenAPI schemas/paths are append-only; no existing field/RPC/schema removed or renumbered).
## Module boundary
This module owns the cross-process contract surface between management, proxy, and dashboard. Two artefacts: `shared/management/proto/proxy_service.proto` (management↔proxy gRPC) and `shared/management/http/api/openapi.yml` (dashboard/CLI↔management REST). Both have generated companions checked in (`proxy_service.pb.go`, `proxy_service_grpc.pb.go`, `types.gen.go`) which must travel in lockstep with their sources. `shared/management/status/error.go` is in scope only for the four new typed `NotFound` constructors that the new HTTP handlers return.
Everything downstream — `management/agentnetwork`, `management/server/http/handlers/*`, `proxy/internal/*`, the dashboard SDK — consumes these types verbatim. The concern here is wire stability and codegen reproducibility, not behaviour: behaviour is covered in the management and proxy module guides.
`management.proto` and `signalexchange.proto` are unchanged. `status/error.go` only receives four additive constructors (lines 208-227); no existing error types are reshaped.
## Files
| Path | Role |
| ---- | ---- |
| `shared/management/proto/proxy_service.proto` | Source of truth: 2 new RPCs, 1 new message group (`MiddlewareConfig` + slot enum), additive fields on `PathTargetOptions`, `AccessLog`, `RecordLLMUsageRequest` |
| `shared/management/proto/proxy_service.pb.go` | Generated (protoc-gen-go) |
| `shared/management/proto/proxy_service_grpc.pb.go` | Generated; adds `CheckLLMPolicyLimits` + `RecordLLMUsage` client/server stubs and `UnimplementedProxyServiceServer` defaults |
| `shared/management/http/api/openapi.yml` | 15 new `AgentNetwork*` schemas, 9 new path groups under `/api/agent-network/*` |
| `shared/management/http/api/types.gen.go` | Generated (oapi-codegen; see codegen note below) |
| `shared/management/status/error.go` | Four `NotFound` constructors for the new resource kinds (lines 208-227) |
## Architecture & flow
```mermaid
sequenceDiagram
participant Dash as Dashboard / CLI
participant Mgmt as management (HTTP+gRPC)
participant Px as proxy
Note over Dash,Mgmt: REST (OpenAPI / types.gen.go)
Dash->>Mgmt: PUT /api/agent-network/providers (AgentNetworkProviderRequest)
Dash->>Mgmt: PUT /api/agent-network/settings (AgentNetworkSettingsRequest)
Dash->>Mgmt: GET /api/agent-network/consumption -> [AgentNetworkConsumption]
Note over Mgmt,Px: gRPC ProxyService (proxy_service.proto)
Mgmt-->>Px: SyncMappingsResponse{ ProxyMapping.path[*].options.middlewares,<br/>agent_network, disable_access_log, capture_* }
Px->>Mgmt: CheckLLMPolicyLimits(account, user, groups, provider, model)
Mgmt-->>Px: decision=allow|deny + selected_policy_id + attribution_group_id + window_seconds
Px->>Mgmt: RecordLLMUsage(account, user, group_id, group_ids, window_seconds, tokens, cost)
Px->>Mgmt: SendAccessLog(AccessLog{ agent_network=true })
```
The proto changes split into three independent slices: (1) **mapping enrichment**`PathTargetOptions` grows fields 8-13 so management can ship middleware configs, capture limits, and the agent-network / log-suppression flags down to the proxy without a second RPC; (2) **two new request/response RPCs** (`CheckLLMPolicyLimits`, `RecordLLMUsage`) for per-LLM-request budget arbitration; (3) **observability tag**`AccessLog.agent_network` so management can route logs to the right surface.
The OpenAPI side is a thin CRUD surface — every resource (`Provider`, `Policy`, `Guardrail`, `BudgetRule`, `Settings`) follows the same `GET-list / POST / GET / PUT / DELETE` pattern, plus a read-only `/consumption` listing and a catalog endpoint. The `*Request` variants drop server-controlled fields (id, timestamps). `AgentNetworkBudgetRule` deliberately reuses `AgentNetworkPolicyLimits` to keep wire-shape parity with policies.
## Public contracts added
- gRPC RPCs (`proxy_service.proto:52-57`): `CheckLLMPolicyLimits(CheckLLMPolicyLimitsRequest) → CheckLLMPolicyLimitsResponse`, `RecordLLMUsage(RecordLLMUsageRequest) → RecordLLMUsageResponse`. Both unary; default `UnimplementedProxyServiceServer` returns `codes.Unimplemented` (`proxy_service_grpc.pb.go:283-289`).
- New messages (`proxy_service.proto:145-175,448-502`): `MiddlewareConfig`, `MiddlewareSlot` enum, `CheckLLMPolicyLimitsRequest`/`Response`, `RecordLLMUsageRequest`/`Response`.
- New `PathTargetOptions` fields 8-13 (`proxy_service.proto:124-140`): `capture_max_request_bytes`, `capture_max_response_bytes`, `capture_content_types`, `middlewares`, `agent_network`, `disable_access_log`. All default-false / zero; pre-existing fields 1-7 byte-for-byte unchanged.
- `AccessLog.agent_network = 18` (`proxy_service.proto:258-261`).
- `RecordLLMUsageRequest.group_ids = 8` (`proxy_service.proto:496-498`) — so the record path can fan out to every applicable budget rule's window without a re-lookup.
- 15 new OpenAPI component schemas (`openapi.yml:5072-5829`): `AgentNetworkProvider[Request|Model]`, `AgentNetworkCatalog{Model,Provider,IdentityInjection,HeaderPairInjection,JSONMetadataInjection,ExtraHeader}`, `AgentNetworkPolicy[Request|TokenLimit|BudgetLimit|Limits]`, `AgentNetworkGuardrail[Checks|Request]`, `AgentNetworkConsumption`, `AgentNetworkSettings[Request]`, `AgentNetworkBudgetRule[Request]`.
- 9 new path groups (`openapi.yml:12797-13460`): `/api/agent-network/{consumption,settings,budget-rules,budget-rules/{ruleId},catalog/providers,providers,providers/{providerId},policies,policies/{policyId},guardrails,guardrails/{guardrailId}}`.
- Four typed NotFound errors (`shared/management/status/error.go:208-227`).
## Invariants
- **Field-number monotonicity.** Every new proto field uses a previously-unallocated number in its message: `PathTargetOptions` 8-13 (was 1-7), `AccessLog` 18 (was 1-17), `RecordLLMUsageRequest` 8. `SendStatusUpdateRequest.inbound_listener = 50` (pre-existing) reserves 50+ for observability extensions, so 8 on `RecordLLMUsageRequest` doesn't conflict.
- **Old proxies stay compatible.** Old management never sends `disable_access_log`/`middlewares`/`agent_network` (zero value → existing behaviour); old proxies that don't decode these fields just drop them silently (proto3 unknown-field semantics) — log emission stays on. No pre-existing field number changed: the proto change is insertions only.
- **Old management stays compatible.** The two new RPCs are registered on the same `management.ProxyService` descriptor; old proxies hitting them get `codes.Unimplemented` from the unimplemented embed (`proxy_service_grpc.pb.go:283-289`), which is the same fallback pattern `SyncMappings` already documents (`proxy_service.proto:20-21`).
- **OpenAPI shapes are append-only.** New schemas are placed at the end of `components.schemas` (line 5072+); new paths at the end of `paths` (line 12797+). No existing schema's `required` list, enum, or property type was changed.
- **`*Request` vs response asymmetry.** Read shapes (`AgentNetworkProvider`, `AgentNetworkPolicy`, `AgentNetworkGuardrail`, `AgentNetworkSettings`, `AgentNetworkBudgetRule`) require `created_at`/`updated_at`; the matching `*Request` shapes do not — server fills them. `AgentNetworkProviderRequest.api_key` is write-only (`openapi.yml:5158-5161` "never returned in responses"); reviewers should confirm the response schema (5072-5138) actually omits `api_key`.
## Things to scrutinize
### Correctness
- `RecordLLMUsageRequest` carries both `group_id` (singular, the attribution group — field 3) and `group_ids` (plural, full membership — field 8). `b22d5a181` adds field 8 to drive account-budget fan-out; double-check that consumers can't accidentally key counters on the wrong one. Field comments at `proxy_service.proto:489-491` and `496-498` distinguish them but it's the kind of subtle thing a follow-up commit might collapse.
- `PathTargetOptions.disable_access_log` is the only field whose default-false meaning **changes semantics** on the proxy side: false → log (status quo), true → suppress. Synthesizer sets `DisableAccessLog = !settings.EnableLogCollection`, so a missing/default settings row yields `EnableLogCollection=false → DisableAccessLog=true → suppressed`. Worth confirming downstream (`agentnetwork.synthesizer`) that operator-defined private services never inherit this flag — the proto field default protects them, but only if synth code is explicit.
- `CheckLLMPolicyLimitsResponse.decision` is a free-form `string` (`proxy_service.proto:471`) rather than an enum. Only documented values are "allow" / "deny". An enum would prevent typo drift; consider before this RPC ships to external consumers.
- `deny_code` (`proxy_service.proto:478-481`) is documented as "a stable label" but is also a free string. Pin the allowed set somewhere observable to the proxy.
### Security
- `AgentNetworkProvider.api_key` MUST be write-only. Schema split (request has it at line 5158; response omits it) looks correct, but a regression here leaks the upstream provider credential to every dashboard reader. Check that the handler explicitly zeros it on the response path.
- `extra_values` / `identity_header_*` headers on `AgentNetworkProvider` get stamped onto upstream requests. Description at `openapi.yml:5099` says "values not declared by the catalog are ignored at synth time" — a contract this module documents but the synthesizer must enforce. Confirm the synth module honours it.
- Cluster + subdomain on `AgentNetworkSettings` are documented immutable (`openapi.yml:5686-5694`) and the `AgentNetworkSettingsRequest` (lines 5733-5752) doesn't accept them. Verify the `PUT /api/agent-network/settings` handler can't be tricked by extra JSON keys (oapi-codegen's `additionalProperties: false` is not declared here; spec defaults to permissive).
### Backward compatibility
- The proto change is field-number additive: every previously numbered field keeps the same name + type, and the change is insertions only (no deletions in `proxy_service.proto`), so this holds at the source-text level.
- `proxy_service_grpc.pb.go` adds two RPC handlers and registers them in `ProxyService_ServiceDesc.Methods` (lines 543-552). The existing entries are unchanged and order-preserving — gRPC method dispatch is name-keyed, so order doesn't matter, but reviewing the diff (no method renamed/dropped) is still worth a glance.
- OpenAPI 3.0 doesn't have a built-in deprecation flow for paths; if any client tooling iterates `paths.*`, the additive routes shouldn't break it, but generated SDKs (especially the dashboard's) need a regen to gain access to `AgentNetwork*`.
### Codegen pinning
- `generate.sh` (`shared/management/http/api/generate.sh:14`) installs `oapi-codegen@latest` rather than a pinned version. **This is a reproducibility gap** — re-running the script later may produce a different `types.gen.go`. Either pin the version in `generate.sh` (e.g. `@v2.7.0`) or document the pin in a `tools.go`.
- proto codegen has the protoc / protoc-gen-go version stamped in the generated file header (`proxy_service.pb.go:3-4`).
- Regenerate locally and confirm zero diff against the committed `types.gen.go` / `proxy_service.pb.go`.
## Test coverage
| Test file | Locks down |
| --------- | ---------- |
| None in this scope | The proto and OpenAPI sources are tested transitively by the handler tests (`shared/management/http/handlers/agentnetwork/...`) and by the synthesizer/manager tests (`management/server/agentnetwork/...`). No round-trip serialisation test exists in the `proto/` or `api/` packages themselves. |
| `shared/management/proto/*_test.go` | (absent) |
| `shared/management/http/api/*_test.go` | (absent) |
Acceptable for codegen artefacts, but a single golden-file test that re-runs `oapi-codegen` and `protoc` in CI and diffs against the checked-in files would close the reproducibility gap noted above.
## Known limitations / explicit non-goals
- **No deprecation surface.** Old fields/RPCs are kept silently; there is no `[deprecated = true]` annotation on anything. Acceptable here because nothing is being removed.
- **No proto-side validation.** Numeric ranges (e.g. `window_seconds >= 60`, `cost_usd >= 0`, capture-byte clamps) are enforced in the OpenAPI schema via `minimum:` and inside Go code by the proxy/management, but `proto3` itself can't express them; downstream is expected to validate every message.
- **`MiddlewareConfig.config_json` is `bytes`** (`proxy_service.proto:163`) — opaque to the proto layer. Schema validity is the middleware factory's problem. This is a deliberate tradeoff (per the comment at 161-162) but worth flagging: a corrupted/malicious config_json can only fail at proxy apply time, not at the wire-decode step.
- **No catalog endpoint schema for the catalog itself** — the catalog data ships as a `GET /api/agent-network/catalog/providers` returning `[AgentNetworkCatalogProvider]` (`openapi.yml:13024`), but the catalog source-of-truth lives in `management/server/agentnetwork/catalog`, not here.
- The reaper / GC design was cut from scope; no reaper-related types appear here.
## Cross-references
- Downstream: [management/store](20-management-store.md), [management/agentnetwork](21-management-agentnetwork.md), [management/handlers + wiring](22-management-handlers-wiring.md), [proxy/runtime](33-proxy-runtime.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)

View File

@@ -1,112 +0,0 @@
# management/store — persistence for agent-network entities
> **Risk level:** Medium — six brand-new tables behind AutoMigrate, one upsert-counter table that runs on the request hot path, and one column carrying an encrypted secret.
> **Backward-compat impact:** Additive (six new tables created by AutoMigrate; the `Store` interface gains 23 methods, but no existing column/index is touched).
## Module boundary
This module is the persistence layer for the Agent Network feature. Everything the management server stores about LLM proxying — providers, policies, guardrails, the per-account settings row, a usage-counter table written on every proxied LLM request, and the account-budget rules — flows through the methods added to `store.Store`. The module owns six tables, six entity types from `management/server/agentnetwork/types`, and a single hot-path upsert (`IncrementAgentNetworkConsumption`) consumed by the proxy fleet.
Out of scope here: the catalog of provider definitions (compiled-in, no DB), the synthesizer/manager built on top of these CRUDs (covered in [21-management-agentnetwork.md](21-management-agentnetwork.md)), and the HTTP handlers that translate API requests into Save/Delete calls.
## Files
| Path | Role |
| ---- | ---- |
| `management/server/store/sql_store_agentnetwork.go` | gorm implementations of all 23 store methods |
| `management/server/store/sql_store_agentnetwork_budgetrule_test.go` | round-trip + account-scoping coverage against a real sqlite store |
| `management/server/store/sql_store.go` | one import, six entities appended to the `AutoMigrate` slice (sql_store.go:40, sql_store.go:141-142) |
| `management/server/store/store.go` | 23 methods added to the `Store` interface (store.go:328-354) |
| `management/server/store/store_mock_agentnetwork.go` | mockgen output for the new interface surface |
## Tables added / migrations
All six tables are created by `db.AutoMigrate` invoked from `NewSqlStore` at sql_store.go:133-143. There is no hand-rolled SQL migration script — the schema is whatever GORM derives from the struct tags.
- `agent_network_providers``Provider.TableName()` at provider.go:76. PK `id`, index on `account_id`, named index `idx_agent_network_provider` on `provider_id`. Carries an at-rest-encrypted `api_key` and ed25519 `session_private_key` (provider.go:35,56). `extra_values` and `models` are JSON blobs (`serializer:json`).
- `agent_network_policies``Policy.TableName()` at policy.go:70. PK `id`, index on `account_id`. JSON columns: `source_groups`, `destination_provider_ids`, `guardrail_ids`, `limits`.
- `agent_network_guardrails``Guardrail.TableName()` at guardrail.go:41. PK `id`, index on `account_id`. JSON `checks`.
- `agent_network_settings``Settings.TableName()` at settings.go:33. PK `account_id` (one row per account), named index `idx_agent_network_settings_cluster_subdomain` on `subdomain` only — the index name implies a composite, but only one column is tagged.
- `agent_network_consumption``Consumption.TableName()` at consumption.go:46. Composite PK across `(account_id, dim_kind, dim_id, window_seconds, window_start_utc)` — the same tuple the upsert keys on.
- `agent_network_budget_rules``AccountBudgetRule.TableName()` at budgetrule.go:35. PK `id`, index on `account_id`. JSON `target_groups`, `target_users`, `limits`.
## CRUD surface added
Provider, Policy, Guardrail, BudgetRule follow the same pattern: `Get<Kind>ByID`, `GetAccount<Kind>` (list), `Save<Kind>` (upsert), `Delete<Kind>`, with account-scoping enforced by the existing `accountAndIDQueryCondition` / `accountIDCondition` constants (sql_store.go:59-62). Provider additionally exposes `GetAllAgentNetworkProviders` (cross-account, used by the synthesizer). Settings exposes `Get`/`GetByCluster`/`Save` (no delete — one row per account, created on first save). Consumption exposes the upsert `Increment`, a point `Get`, and a cross-window `List`.
## Architecture & flow
```mermaid
flowchart LR
handlers["HTTP handlers<br/>(management/server/agentnetwork)"] -->|Save/Delete| iface["Store interface<br/>store.go:328-354"]
manager["agentnetwork.Manager"] -->|Get*| iface
synth["synthesizer<br/>(global)"] -->|GetAllAgentNetworkProviders| iface
proxy["proxy fleet<br/>(hot path)"] -->|IncrementAgentNetworkConsumption| iface
iface --> sql["SqlStore methods<br/>sql_store_agentnetwork.go"]
iface -.gomock.-> mock["MockStore<br/>store_mock_agentnetwork.go"]
sql --> gorm["gorm.DB"]
gorm --> tables[("6 tables<br/>agent_network_*")]
sql --> enc["crypt.FieldEncrypt<br/>(provider only)"]
```
Reads decrypt provider secrets in-place; writes do `provider.Copy().EncryptSensitiveData(...)` before `db.Save` so the caller's in-memory object keeps the plaintext `api_key` (sql_store_agentnetwork.go:88-102). Every list/get takes a `LockingStrength` and applies `clause.Locking{Strength: ...}` when non-`None` — matching the rest of the store. The upsert path uses `clause.OnConflict` with `gorm.Expr` server-side increments so concurrent proxy nodes converge without read-modify-write races (sql_store_agentnetwork.go:321-335).
## Invariants enforced at the store layer
- **Account scoping.** Every entity-by-ID method keys on `account_id = ? and id = ?`; no cross-tenant leak path through the API is reachable as long as callers always pass the auth'd `accountID` (sql_store_agentnetwork.go:70,141,201,429).
- **NotFound mapping.** `gorm.ErrRecordNotFound` is translated to typed `status.NewAgentNetwork*NotFoundError`; `Delete*` returns NotFound when `RowsAffected == 0` (sql_store_agentnetwork.go:111-113,171-173,231-233,461-463).
- **Provider secret encryption at rest.** `SaveAgentNetworkProvider` always encrypts before persist; `Get*` always decrypts after read. The plaintext `api_key` never reaches the DB through this layer (sql_store_agentnetwork.go:31,54,80,90).
- **Consumption monotonicity.** The upsert only ever issues `col = col + ?` for the three counter columns — no decrement path exists (sql_store_agentnetwork.go:330-332).
- **Window alignment is the caller's responsibility.** The store stamps `WindowStartUTC` as-passed; alignment to epoch happens in `types.WindowStart` at consumption.go:51-58.
- **Settings has no Delete.** Intentional — one row per account, created on first save; the row sticks around for the account lifetime.
## Things to scrutinize
### Correctness
- `SaveAgentNetworkProvider` saves the copy (sql_store_agentnetwork.go:95). The caller's in-memory pointer therefore keeps plaintext `api_key` and any `CreatedAt`/`UpdatedAt` gorm autofills land on the copy, not the original. Callers that need synced timestamps must re-fetch.
- `IncrementAgentNetworkConsumption`'s `Create` provides initial counter values (`TokensInput: tokensIn`, etc.) in the row, and on conflict the assignments add the same deltas to the existing values. The insert-vs-update arithmetic is consistent. Cross-check that no engine in use (sqlite, postgres, mysql) silently rejects the `OnConflict` clause — GORM emits engine-specific SQL but `ON DUPLICATE KEY UPDATE` (mysql) vs `ON CONFLICT (...)` (sqlite/postgres) need their unique constraint to match the composite PK on `agent_network_consumption`; it does, by construction.
- `IncrementAgentNetworkConsumption` writes `updated_at: time.Now().UTC()` literally inside the assignments map (sql_store_agentnetwork.go:333) — fine, but it's a Go-side timestamp captured at call time, not a DB-side `now()`. Acceptable for an audit field.
- `GetAgentNetworkConsumption` returns a zero-valued non-nil row on `ErrRecordNotFound` (sql_store_agentnetwork.go:364-371). Document or rename — a typed sentinel error would be more orthodox; callers must know not to error-check.
### Concurrency / transactions
- Hot-path `IncrementAgentNetworkConsumption` runs outside any explicit transaction; concurrency safety relies entirely on the DB serialising the `ON CONFLICT` upsert against the composite PK. This is correct for postgres and mysql; for sqlite it serialises behind the single writer.
- `SaveAgentNetworkSettings` is a blind upsert with no version/etag — concurrent writes from two operators last-write-wins on the collection-toggle flags (settings.go:23-25). Acceptable for admin-curated state but worth flagging.
- `Save*Provider` uses `db.Save` on a struct with a PK already set — GORM emits UPDATE or INSERT based on row existence. No upsert clause is attached, so a race between two creates with the same generated `xid` (vanishingly unlikely) would surface as a PK violation.
### Migration safety
- All six tables ride `AutoMigrate` (sql_store.go:141-142). AutoMigrate is additive: new columns get added, but it never drops columns nor narrows types. Three `bool` columns on `agent_network_settings` (`EnableLogCollection`, `EnablePromptCollection`, `RedactPii`) default to false at the GORM/DDL layer for existing rows; the test at sql_store_agentnetwork_budgetrule_test.go:83-112 locks that down on a fresh sqlite. Verify postgres/mysql produce the same default.
- The named index `idx_agent_network_settings_cluster_subdomain` on settings.go:15 is declared on only `subdomain`. Either the cluster column also needs `gorm:"index:idx_agent_network_settings_cluster_subdomain"` to make it composite, or the name is misleading.
- The named index `idx_agent_network_provider` on `Provider.ProviderID` (provider.go:30) is *not* unique and not scoped to account — two providers in the same account with the same `provider_id` are permitted at the DB layer; uniqueness, if any, must live above the store.
### Backward compatibility
- Net additive. No removed methods, no renamed columns, no schema change to existing tables. Existing deployments running a prior binary continue to work; the first boot of the new binary creates the six tables.
- The `Store` interface grows by 23 methods (store.go:330-354); any non-mock external implementer of `store.Store` will fail to compile. The repo only has `SqlStore` + `MockStore`, both updated.
### Performance (indexes, N+1)
- All by-account list queries hit the `idx_account_id` per-table index. No N+1: list methods return the full slice in one query.
- `GetAgentNetworkSettingsByCluster` (sql_store_agentnetwork.go:263-277) does a tablescan on `cluster` — no index. Tolerable for the bootstrap label generator (one-shot at provisioning) but worth noting if the call moves onto a hot path.
- `ListAgentNetworkConsumption` returns every row ever recorded for the account (sql_store_agentnetwork.go:382-400) — unbounded growth, no `LIMIT`, no time filter. With one row per (dim, window) per request burst, this table grows fastest of the six; a retention job + a paginated list method are obvious follow-ups.
## Test coverage
| Test file | Locks down |
| --------- | ---------- |
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_RoundTrip` | full save → reload of `AccountBudgetRule` including the JSON-serialised `PolicyLimits`, target slices, double-delete returns NotFound (lines 18-59) |
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_ScopedByAccount` | cross-account isolation for budget rules (lines 63-78) |
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkSettings_RealStore_CollectionTogglesRoundTrip` | collection toggles default off, survive save/reload at the set values (lines 83-112) |
Gap: there is no store-level test for providers (encryption round-trip), policies, guardrails, or `IncrementAgentNetworkConsumption` (concurrent upsert, window-key uniqueness). The consumption upsert is the most performance-sensitive method in this module and the only one without a real-sqlite test.
## Known limitations / explicit non-goals
- No retention / GC for `agent_network_consumption`.
- No `Delete` for `Settings` (one row per account, cleared with the account).
- No DB-engine-specific tuning — the same struct tags drive sqlite, mysql, postgres.
- Provider `extra_values` and `models` are JSON blobs; querying inside them is not supported by design.
- `GetAgentNetworkConsumption` "not-found = zero row" contract is convenient but unconventional.
## Cross-references
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)

View File

@@ -1,225 +0,0 @@
# management/agentnetwork — domain layer + synth pipeline
> **Risk level:** High — central business logic + budget enforcement + the source of every middleware-chain change the proxy executes.
> **Backward-compat impact:** Additive within the agent-network surface; one **behavioural difference for opted-out accounts** in parser capture (the capture flag is stamped explicitly false instead of being absent — see capture-pointer semantics below). Non-agent-network proxy services are untouched (the synth chain only ships on `agent-net-svc-*` targets).
## Module boundary
`management/server/agentnetwork` owns every agent-network entity (providers, policies, guardrails, account budget rules, per-account settings, consumption rows) and **translates them into the in-memory `*rpservice.Service` that the reverse-proxy controller turns into `proto.ProxyMapping`s and pushes to clusters**. It is the *only* writer of the agent-network middleware chain.
Inside the package: `manager.go` is the CRUD + permissions-gated facade; `synthesizer.go` walks settings + providers + policies + guardrails and emits the per-account service plus every middleware's JSON config; `policyselect.go` runs per-request attribution (min-wins account ceiling, then "drain bigger pool first"); `reconcile.go` diffs successive synth outputs and emits precise Create/Update/Delete proxy-mapping updates plus a peer-map refresh. `labelgen/` mints DNS-safe subdomain labels; `catalog/` is the static provider catalogue; `types/` carries gorm entity structs. The `_realstack_test.go` files in the parent `management/server/` directory exercise the manager + network-map controller end-to-end with no mocks.
## Files
| Path | Role |
| ---- | ---- |
| `agentnetwork/manager.go` | Manager interface + CRUD + permission gates + bootstrap-settings + reconcile trigger |
| `agentnetwork/synthesizer.go` | Settings/policy → wire-format synthesis; sole writer of the proxy middleware chain |
| `agentnetwork/policyselect.go` | Per-request policy attribution + account-budget ceiling (min-wins) |
| `agentnetwork/reconcile.go` | Per-account synth diff vs in-memory cache → Create/Update/Delete |
| `agentnetwork/catalog/catalog.go` | Static provider catalogue (auth headers, identity-injection shapes) |
| `agentnetwork/labelgen/{labelgen,words}.go` | DNS-safe subdomain picker + curated wordlist |
| `agentnetwork/types/provider.go` | Provider entity + APIKey + Models + ExtraValues + SessionKeys |
| `agentnetwork/types/policy.go` | Policy entity + `PolicyLimits` (token + budget) |
| `agentnetwork/types/guardrail.go` | Guardrail entity (`ModelAllowlist`, `PromptCapture`) |
| `agentnetwork/types/budgetrule.go` | `AccountBudgetRule` (reuses `PolicyLimits`) |
| `agentnetwork/types/settings.go` | Per-account `Settings` (Cluster, Subdomain, 3 toggles) |
| `agentnetwork/types/consumption.go` | `Consumption` row + `WindowStart` aligner |
| `agentnetwork/{synthesizer,policyselect,reconcile,wire_shape}_*test.go` | See test coverage table |
| `agentnetwork/types/consumption_test.go` | `WindowStart` alignment proofs |
| `agentnetwork/labelgen/labelgen_test.go` | Deterministic picks + exhaustion + fallback |
| `management/server/agentnetwork_realstack_test.go` | No-mock provider CRUD → network-map fan-out |
| `management/server/agentnetwork_budgetrule_realstack_test.go` | No-mock budget-rule CRUD + settings preserve-immutable |
## Architecture & flow
### Synthesis (settings/policy → wire format)
```mermaid
flowchart TD
A[Mutation: provider/policy/guardrail/settings] --> B[managerImpl.reconcile accountID]
B --> C{proxyController nil?}
C -- yes --> D[accountManager.UpdateAccountPeers only]
C -- no --> E[SynthesizeServices]
E --> F[loadSettings — NotFound returns ok=false, no synth]
F --> G[filterEnabledProviders sorted by CreatedAt]
G --> H[filterEnabledPolicies]
H --> I[backfillProviderSessionKeys if missing]
I --> J[indexProviderGroups: providerID -> sorted source groups]
J --> K[buildRouterConfigJSON drops orphan providers]
J --> L[buildIdentityInjectConfigJSON per catalog entry]
H --> M[mergeGuardrails: union allowlist, OR redact]
M --> N[applyAccountCollectionControls account toggle = SOLE capture control]
N --> O[marshalGuardrailConfig]
K --> P[buildMiddlewareChain 8 middleware entries]
L --> P
O --> P
P --> Q[buildAccountService: AccessGroups=union source groups, noop.invalid target]
Q --> R[reconcile.diffMappings vs cache]
R --> S[SendServiceUpdateToCluster CREATE/MODIFY/REMOVE]
R --> T[accountManager.UpdateAccountPeers — fans synth ACLs into network map]
```
### Budget rule resolution (min-wins, group+user bound)
```mermaid
flowchart TD
A[SelectPolicyForRequest in] --> B[checkAccountBudget — runs FIRST, independent of policies]
B --> C[GetAccountAgentNetworkBudgetRules]
C --> D{for each enabled rule}
D --> E{budgetRuleApplies?}
E -- no --> D
E -- yes --> F[attrGroup = lowestIntersect TargetGroups, in.GroupIDs]
F --> G{Token cap enabled?}
G -- yes --> H[evalTokenCap user dim + group dim]
H --> I{exhausted?}
I -- yes --> J[DENY: llm_account.token_cap_exceeded - STOP]
I -- no --> K{Budget cap enabled?}
G -- no --> K
K -- yes --> L[evalBudgetCap user dim + group dim]
L --> M{exhausted?}
M -- yes --> N[DENY: llm_account.budget_cap_exceeded - STOP]
M -- no --> D
K -- no --> D
D --> O[All rules passed -> fall through to per-policy selection]
```
Key invariant: **rules are checked sequentially and ANY exhausted rule denies (all-must-pass / min-wins).** Untargeted rules (`len(TargetGroups)==0 && len(TargetUsers)==0`) apply to every caller (`policyselect.go:393`).
### Policy selection (per-peer, per-request)
```mermaid
flowchart TD
A[Account-budget gate passed] --> B[GetAccountAgentNetworkPolicies]
B --> C[filterApplicablePolicies enabled + provider match + group intersect]
C --> D{candidates empty?}
D -- yes --> E[Allow, empty SelectedPolicyID]
D -- no --> F[scoreCandidates -> scoreOne per policy]
F --> G[scoreOne: attrGroup + window]
G --> H{any cap exhausted?}
H -- yes --> I[Drop policy; record last deny code]
H -- no --> K[Keep as live candidate]
F --> L{live candidates exist?}
L -- no --> M[Deny with last exhaustion code]
L -- yes --> N[Sort: uncapped wins -> larger group token -> group budget -> user token -> user budget -> oldest CreatedAt]
N --> O[winner = scored 0]
O --> P[Allow + SelectedPolicyID + AttributionGroupID + WindowSeconds]
```
End-to-end: a mutation calls `managerImpl.reconcile(ctx, accountID)` (`manager.go:205,239,...`). Reconcile defers an `accountManager.UpdateAccountPeers` so the network-map controller re-runs and `injectAllProxyPolicies` picks up the new access groups; with a `proxyController` wired, it re-synthesizes the service, diffs against `reconcileCache[accountID]` (guarded by `reconcileMu`), and emits proto mappings to the cluster derived from the mapping's domain (`reconcile.go:120`). Synthesis is stateless and idempotent. Sole persistent side effect: `backfillProviderSessionKeys` (`synthesizer.go:249`) mints ed25519 keys on legacy provider rows and writes them back.
At request time the path is independent: the proxy calls `SelectPolicyForRequest` (`policyselect.go:56`); account-budget ceiling first, then per-policy scoring. Token + budget caps share `evalTokenCap` / `evalBudgetCap` — same primitive for account rules and policy limits, `label` differentiates the deny reason. After a served request, `RecordAccountBudgetUsage` (`policyselect.go:415`) fans deltas to every applicable rule's distinct `(dim_kind, dim_id, window)` tuple, deduplicating to prevent double-count when two rules share target+window.
## Public contracts
- **Manager interface** (`manager.go:48-80`): CRUD for `Providers/Policies/Guardrails/BudgetRules`; `GetSettings/UpdateSettings` (cluster + subdomain immutable, only the three toggles mutate); `ListConsumption/RecordConsumption(account, kind, dimID, windowSec, in, out, USD)`; `RecordAccountBudgetUsage(account, user, groups, in, out, USD)`; `SelectPolicyForRequest(ctx, PolicySelectionInput) → *PolicySelectionResult{Allow, SelectedPolicyID, AttributionGroupID, WindowSeconds, DenyCode, DenyReason}`.
- **`PolicySelectionInput`** (`manager.go:85-90`): `{AccountID, UserID, GroupIDs, ProviderID}` — populated by the proxy from CapturedData + `llm_router` resolution.
- **Synthesized middleware chain** (`synthesizer.go:576-657`), order load-bearing — response slot runs reverse-of-slice:
| Slot | Idx | ID | ConfigJSON shape | CanMutate |
| --- | --- | --- | --- | --- |
| on_request | 0 | `llm_request_parser` | `{"capture_prompt": <bool>, "redact_pii"?: true}` | |
| on_request | 1 | `llm_router` | `{"providers":[{id, models[], upstream_*, auth_header_*, allowed_group_ids[]}]}` | **true** |
| on_request | 2 | `llm_limit_check` | `{}` | |
| on_request | 3 | `llm_identity_inject` | `{"providers":[{provider_id, header_pair?, json_metadata?, extra_headers?}]}` | **true** |
| on_request | 4 | `llm_guardrail` | `{"model_allowlist"?, "prompt_capture":{enabled,redact_pii}}` | |
| on_response | 5 | `llm_limit_record` | `{}` (runs LAST at runtime) | |
| on_response | 6 | `cost_meter` | `{}` | |
| on_response | 7 | `llm_response_parser` | `{"capture_completion": <bool>, "redact_pii"?: true}` | |
- **Synthesized service shape** (`synthesizer.go:739`): `Mode=HTTP`, `Private=true`, `Domain=<subdomain>.<cluster>`, `AccessGroups=unionSourceGroups(enabledPolicies)`, one `TargetTypeCluster` target with `Host=noop.invalid:443` (router rewrites per request), `Options.{DirectUpstream,AgentNetwork}=true`, `DisableAccessLog=!settings.EnableLogCollection`, `CaptureMax{Req,Resp}Bytes=1<<20`, `CaptureContentTypes=["application/json","text/event-stream"]`.
## Invariants
- **Min-wins / all-must-pass for account budget rules** (`checkAccountBudget`, `policyselect.go:353`): every applicable enabled rule is checked; first exhausted cap denies. Untargeted rules bind every caller.
- **Account toggle is the SOLE control for capture enablement.** `applyAccountCollectionControls` (`synthesizer.go:701`) sets `merged.PromptCapture.Enabled = settings.EnablePromptCollection` *unconditionally*.
- **Capture-pointer semantics on parser configs** — see "Things to scrutinize" below.
- **`EnableLogCollection``DisableAccessLog` is the only access-log toggle** (`synthesizer.go:770`). Default off ⇒ access log suppressed.
- **`RedactPii` flows verbatim to BOTH parsers** (`synthesizer.go:584-585`) and is OR'd into the merged guardrail (`synthesizer.go:706`).
- **Cluster and Subdomain are immutable on Settings.** `UpdateSettings` reloads existing row and overlays only the three toggles (`manager.go:558-561`).
- **Orphan providers (no enabled policy authorises them) NEVER reach the router** (`synthesizer.go:351-357`); skipped from `identity_inject` for symmetry.
- **Provider creation refuses empty `api_key`** (`manager.go:175`); **deletion refuses while any policy still references it** (`manager.go:265-273`).
- **Session keypair stability across provider edits** (`manager.go:226-228`) — server-managed, copied through every `UpdateProvider`, never API-surfaced.
## Things to scrutinize
### Correctness
- **Capture-pointer semantics — `*bool` vs `bool`.** Three states, owned by separate sides:
- **Wire JSON this module emits:** `buildParserConfigJSON` (`synthesizer.go:678-693`) *always* stamps the capture field. Agent-network targets ship `"capture_prompt": false` or `"capture_prompt": true` — never absent. Same for `"capture_completion"`. The happy-path test pins `{"capture_prompt":false}` (`synthesizer_test.go:174`).
- **Proxy-side parser config (consumer):** parsers decode into `*bool`. Matrix:
- `nil` (field absent) → **legacy default = emit**. Preserved for non-agent-network callers and pre-existing tests (the backward-compat hook).
- `false` (field present, value false) → **suppress emission entirely**. The behaviour for opted-out agent-network accounts. Without this, `enable_log_collection=true` + `enable_prompt_collection=false` would leak raw user input AND raw model output to the access log.
- `true` → emit normally.
- **Why the synth always stamps a value:** an agent-network mapping omitting the field would hit legacy "always emit" and re-introduce the leak. The `json.Marshal` error fallback at `synthesizer.go:687` degrades to `{}` — comment-claimed unreachable, but if ever fired re-introduces the leak. Consider fail-closed (return literal `{"capture_prompt":false}`) instead.
- **`scoreCandidates` non-cumulative deny code.** Only the *last* exhausted policy's deny code survives (`policyselect.go:188-190`). Iteration order is store's natural order. Auth signal is `len(scored)==0`, so this is informational only — verify no UI depends on "first exhausted policy" semantics.
- **`effectiveWindowSeconds` token-wins tiebreak.** When both halves are enabled with different windows, token's window wins (`policyselect.go:482`). Verify `RecordLLMUsage` increments against the winning window only.
- **`RecordAccountBudgetUsage` dedup.** Two rules with the same `(kind, dim_id, window)` would double-count without the `tuples` map (`policyselect.go:434-449`). Key includes all three dimensions — correct.
- **Fail-closed on bad provider:** unknown catalog id (`synthesizer.go:794-796`) or empty API key (`synthesizer.go:801-803`) drops the **entire** account's synth, not just the bad provider. Confirm matches operator UX.
### Security
- **Redact OR-merge:** merged `RedactPii` = account OR guardrail (`synthesizer.go:706`). **Parser-side flag is `settings.RedactPii` only, NOT the OR** — a guardrail-only opt-in does not propagate to parsers. Correct because the account toggle gates capture, but worth noting on the proxy side.
- **Group resolution must not leak across accounts.** Every store call carries `accountID` (`policyselect.go:73, 286, 298, 322, 334, 354`); `lowestIntersect` uses caller's claimed groups only (`policyselect.go:494`). Risk surface is upstream (handler populates `in.GroupIDs`).
- **`UpdateSettings` preserves immutable Cluster + Subdomain** (`manager.go:558`). A client can't rebind the cluster.
- **Provider session keypair backfill writes through `SaveAgentNetworkProvider`** (`synthesizer.go:256`) from a read-shaped call. Idempotent → worst case is a wasted write under concurrent reconcile + snapshot.
### Concurrency
- **`reconcileMu`** guards `reconcileCache`. Lock window is narrow — compute diff inside, send outside (`reconcile.go:56-68`).
- **`labelRngMu`** guards `labelRng` because `math/rand.Source` is unsafe for concurrent use (`manager.go:638-640`).
- **Real-store tests** use `store.NewTestStoreFromSQL` with `t.TempDir()` per test — no shared state, no `t.Parallel()`.
- **`RecordAccountBudgetUsage` dedup `tuples` map is per-call;** concurrent calls fan out fully — correct (each request's tokens book once per applicable rule).
- **Deferred `UpdateAccountPeers` runs inline after the proxy push** (`reconcile.go:28-35`); a slow call stretches CRUD response time.
### Backward compatibility
- **Capture-pointer semantics (restated):** non-agent-network callers see no field → legacy nil-default emit, identical to pre-PR. Agent-network targets always carry an explicit `capture_*` value.
- **`TestSynthesizeServices_HappyPath` was updated:** request-parser config moved from `{}` to `{"capture_prompt":false}` (`synthesizer_test.go:174`). External snapshot tests against synth output need updating.
- **`MergedGuardrails` retains zeroed `TokenLimits`/`Budget`/`Retention`** even though `Policy.Limits` carries the real values now; `llm_limit_check` is the authoritative enforcement. Comment at `synthesizer.go:940-948` calls this out.
### Performance
- **`SynthesizeServices` runs on every controller tick / mutation reconcile.** Cost: 4 store reads + optional per-provider keypair backfill. Sort + index + merge are O(N log N) / O(P × G); dominant cost is JSON marshalling. No nested loops escape these dimensions.
- **`reconcile.diffMappings` is O(N + M)** with N=M=1 per account today — effectively constant.
- **`SynthesizeServicesForCluster`** (`synthesizer.go:71`) walks every account on a cluster; per-account failures are **swallowed** (`synthesizer.go:91-93`) so a single misconfigured account doesn't drop the cluster. Runs per proxy reconnect.
### Observability
- **Activity codes:** `AgentNetwork{Provider,Policy,Guardrail,BudgetRule}{Created,Updated,Deleted}`; `AgentNetworkSettingsUpdated` with `log_collection/prompt_collection/redact_pii` payload (`manager.go:567-571`). **No activity code for `SelectPolicyForRequest` denies** — surfaced via proxy access log only (likely intentional given volume).
- **Deny codes** namespaced: `llm_policy.{token,budget}_cap_exceeded`, `llm_account.{token,budget}_cap_exceeded` (`policyselect.go:18-26`).
- **Reconcile failures are logged at warn and swallowed** (`reconcile.go:42-44`). Persistent synth failures (e.g. unknown catalog id) silently keep the proxy out of sync — consider a manager-level synth-health surface if this becomes a support burden.
## Test coverage
| Test file | Locks down |
| --------- | ---------- |
| `synthesizer_test.go` | Mock-store: `HappyPath` (8-mw chain ordering, `{"capture_prompt":false}` baseline); `No{Settings,Providers}`; `Disabled{Provider,Policy}_NoService`; `RouterConfigOrdering`; `PolicyCheckConfig_UnionsSourceGroups`; `OrphanProvider_HasEmptyAllowedGroups`; identity-inject for LiteLLM / Bifrost (overrides + partial disable) / Cloudflare / Portkey / Vercel / OpenRouter / generic non-customizable; `GuardrailMerge_AllowlistUnion_LimitsRestrictive`; `BackfillsMissingSessionKeys`; `HTTPUpstream_KeepsExplicitPort`; `UpstreamURLPath_FlowsToRouter`; `UnknownProviderID_FailsClosed`; `EmptyAPIKey_FailsClosed`. |
| `synthesizer_realstore_test.go` | Real-sqlite: `SurvivesStatusToggle` reproduces the disable/re-enable 403 regression; `Reconcile_RealStore_PushesPrivateAfterStatusToggle` extends through reconcile push. |
| `synthesizer_guardrail_realstore_test.go` | `PromptCaptureAccountIsSoleControl`; `PromptCaptureFlowsWhenAccountOptsIn`; `AccountRedactWithoutGuardrailRedact`; `NoGuardrail_CaptureOff`. |
| `synthesizer_log_collection_realstore_test.go` | `LogCollection{Off_SuppressesAccessLog,On_PermitsAccessLog}` — verifies `DisableAccessLog` propagation through `ToProtoMapping`. |
| `synthesizer_parser_redact_realstore_test.go` | **Capture-pointer regression suite:** `ParserConfigsCarryRedactPii`; `ParserConfigsSuppressCaptureWhenLogCollectionOnly` (log=on/prompt=off ⇒ both capture flags false); `ParserConfigsOmitRedactPiiWhenOff`. |
| `policyselect_test.go` | Mock-store: `NoApplicablePolicies`; `AllowWithLowestGroupAttribution`; `LargerPoolWinsAcrossUsageLevels`; `StaysOnLargerPoolAfterPartialDrain`; `FallsThroughToSmallerPoolWhenLargerExhausted`; `TiebreakBy{LargerGroupPool,CreatedAt}`; `DeniesWhenAllExhausted`; `UncappedPolicyAlwaysWinsAgainstCapped`; `DisabledPolicyIgnored`; `StoreErrorPropagates`; `RejectsEmptyAccount`; `SharesGroupCounterAcrossPolicies`; `AntiFallThroughOnLowestGroup`; `BudgetOnlyExhaustionDenies`; `BudgetTighterThanTokenWins`. |
| `policyselect_realstore_test.go` | Real-sqlite regression guard: `NoApplicablePolicies`; `AllowAndLowestGroupAttribution`; `LargerPoolWins_FallsThroughWhenExhausted`; `BudgetCapDenies`; `GroupCounterSharedAcrossPolicies`; `DisabledPolicyIgnored`. |
| `policyselect_account_realstore_test.go` | Account budget rules: `AccountCeilingBindsEvenWithUncappedPolicy` (min-wins); `AccountGroupCeiling`; `AccountTargetUsersBindsOnlyThatUser`; `AccountRuleRecordsToOwnWindow`. |
| `reconcile_test.go` | `FirstSynth_EmitsCreate`; `NoChange_EmitsNothingExtra` (re-push as Modified — verify desired); `PolicyRemoved_EmitsDelete`; `NilProxyController_NoOp`; `EmptyAccountID_NoOp`; `ClusterFromMapping`. |
| `wire_shape_test.go` | `TestSynthesizedService_WireShape` — proto-shape lockdown via `ToProtoMapping`. Catches "service not matching" (mapping reaches proxy but no SNI/HTTP route). Asserts ID, Domain, Mode, AuthToken, `Private`, `Auth.Oidc=false`, one path `/` + `https://noop.invalid/`, 8 middlewares with correct slot enums, router config `auth_header_value="Bearer sk-test-key"`. |
| `labelgen/labelgen_test.go` | `PickUnique_{DeterministicWithSeededRng,AvoidsTakenWordsWhenMostAreReserved,FallsBackWhenAllReserved}`; `UniqueWords_DropsDuplicates`. |
| `types/consumption_test.go` | `WindowStart_{AlignedToUnixEpoch,WithinWindowConverges,AcrossWindowsDiverges,DifferentWindowsHaveDifferentBuckets,SubMinuteAndMinuteAlignment,ZeroWindowReturnsInputUTC}`. Bucket alignment so multi-node reads converge. |
| `agentnetwork_realstack_test.go` | `ProviderCRUD_FansOutToProxyAndClientPeers` — no-mock end-to-end through real account manager + network-map + agentnetwork: provider create propagates the updated map to both proxy peer and client peer with the synth DNS surface. |
| `agentnetwork_budgetrule_realstack_test.go` | `BudgetRuleCRUD_RealManager`; `UpdateSettings_PreservesImmutableAndTogglesCollection`. |
## Known limitations / explicit non-goals
- **`MergedGuardrails.TokenLimits/Budget/Retention` emit at zero** (`synthesizer.go:940-948`); real enforcement is `Policy.Limits` via `llm_limit_check`. Future cleanup implied.
- **Session keys picked from first enabled provider by created_at** (`pickServiceSessionKeys`, `synthesizer.go:270`). Existing session cookies survive provider edits only while the first-by-CreatedAt provider stays in place. Document for operators.
- **Reconcile failures silently swallowed** (`reconcile.go:42-44`). Persistent failures keep the proxy out of sync until the next reconcile.
- **`scoreCandidates` exposes only the LAST exhaustion's deny code** when multiple policies are exhausted.
- **`bootstrapSettingsIfNeeded` failure is non-fatal to provider create** (`manager.go:200`): provider lands, synth is no-op until the next provider create retries the bootstrap.
- **Budget rules do not trigger a reconcile** (`manager.go:476-477`). Request-time evaluation only; new rules take effect on the next request without a proxy push.
## Cross-references
- **Upstream:** [shared/api](10-shared-api.md), [management/store](20-management-store.md), reverseproxy `service`/`proxy`/`sessionkey` packages, `management/server/permissions` + `activity`.
- **Downstream:** [management/handlers (HTTP wiring)](22-management-handlers-wiring.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), network-map controller (`injectAllProxyPolicies` fan-out).
- **End-to-end flow:** [../01-end-to-end-flows.md](../01-end-to-end-flows.md) — "Provider create → reconcile → proxy push → peer map refresh" and "request → policy select → record" diagrams.
- **Top-level:** [../00-overview.md](../00-overview.md)

View File

@@ -1,203 +0,0 @@
# management/handlers + wiring — HTTP API + gRPC delivery
> **Risk level:** Medium — the surface is mostly additive, but two changes are load-bearing: `injectAllProxyPolicies` runs on every per-peer compute, and `shallowCloneMapping` must round-trip `Private` (a missed field silently breaks every MODIFIED).
> **Backward-compat impact:** Additive on the wire (new routes, new RPCs, new proto fields, new gorm column on `AccessLogEntry`). One management-internal break: `nbhttp.NewAPIHandler` gains a trailing `agentNetworkManager` parameter; `nil` is tolerated and silently skips route registration.
## Module boundary
This module is the seam between the public Agent Network HTTP API and the proxy fleet that serves agent traffic. North side: a `/api/agent-network/*` surface (providers, policies, guardrails, budget rules, settings, consumption) on the existing gorilla router, delegating to `agentnetwork.Manager`. Handlers are thin — they translate `api.*``types.*`, validate shape, forward. RBAC and event emission stay inside the manager (`manager.go:680-682`).
South side: `ProxyServiceServer` (`proxy.go`) learns to (a) ship synth services to a proxy on initial snapshot, (b) resolve agent-network domains in `getServiceByDomain` for OIDC/session/tunnel-peer flows, (c) gate LLM requests via `CheckLLMPolicyLimits` + `RecordLLMUsage`, (d) preserve `Private` through `shallowCloneMapping` so per-proxy live updates don't silently flip services public. The network_map controller prepends synth services to `account.Services` on every per-peer compute; `accesslogentry.go` gains an indexed `AgentNetwork` column so the dashboard can filter cheaply.
## Files
| Path | Role |
| ---- | ---- |
| `handlers/agentnetwork/providers_handler.go` | Catalog + provider CRUD + central `AddEndpoints` |
| `handlers/agentnetwork/policies_handler.go` | Policy CRUD + shared `validatePolicy*` |
| `handlers/agentnetwork/guardrails_handler.go` | Guardrail CRUD |
| `handlers/agentnetwork/budget_handler.go` | Account-level budget rule CRUD |
| `handlers/agentnetwork/settings_handler.go` | GET (200+`null` if unbootstrapped) + PUT toggles |
| `handlers/agentnetwork/consumption_handler.go` | Read-only consumption rows |
| `handlers/agentnetwork/handlers_test.go` | Real-store fixture; wire round-trip + validation |
| `handlers/agentnetwork/budget_handler_test.go` | Budget-rule + settings toggles |
| `server/http/handler.go` | New `agentNetworkManager` arg; conditional `AddEndpoints` |
| `server/permissions/modules/module.go` | New `AgentNetwork` module key |
| `internals/server/boot.go` | Wires synthesiser adapter + limits service into proxy server |
| `internals/server/modules.go` | `AgentNetworkManager()` lazy-create node |
| `internals/controllers/network_map/controller/controller.go` | `injectAllProxyPolicies` replaces 4 `InjectProxyPolicies` calls |
| `internals/controllers/network_map/controller/repository.go` | `SynthesizeAgentNetworkServices` repo method |
| `internals/modules/reverseproxy/service/service.go` | `MiddlewareConfig`, capture limits, `AgentNetwork`, `DisableAccessLog` + proto |
| `internals/modules/reverseproxy/accesslogs/accesslogentry.go` | Indexed `AgentNetwork bool` from proto |
| `internals/shared/grpc/proxy.go` | Synth wiring, 2 RPCs, domain fallback, `Private` in clone |
| `internals/shared/grpc/proxy_clone_test.go` | Locks every `ProxyMapping` field minus `AuthToken` |
| `server/activity/codes.go` | 13 new activity codes (125-137) |
## HTTP routes added
All routes inherit the platform's auth middleware. Perms enforced inside `agentnetwork.Manager.requirePermission` (`manager.go:680-682`) on `modules.AgentNetwork`. Permission column shows the `op` passed to `requirePermission` — read = `Read`, etc.
| Method | Path | Perm | Handler |
| ------ | ---- | ---- | ------- |
| GET | `/agent-network/catalog/providers` | authn only | `providers_handler.go:43` |
| GET | `/agent-network/providers` | read | `providers_handler.go:57` |
| POST | `/agent-network/providers` | create | `providers_handler.go:97` |
| GET | `/agent-network/providers/{providerId}` | read | `providers_handler.go:77` |
| PUT | `/agent-network/providers/{providerId}` | update | `providers_handler.go:132` |
| DELETE | `/agent-network/providers/{providerId}` | delete | `providers_handler.go:172` |
| GET | `/agent-network/policies` | read | `policies_handler.go:32` |
| POST | `/agent-network/policies` | create | `policies_handler.go:72` |
| GET | `/agent-network/policies/{policyId}` | read | `policies_handler.go:52` |
| PUT | `/agent-network/policies/{policyId}` | update | `policies_handler.go:102` |
| DELETE | `/agent-network/policies/{policyId}` | delete | `policies_handler.go:142` |
| GET | `/agent-network/guardrails` | read | `guardrails_handler.go:25` |
| POST | `/agent-network/guardrails` | create | `guardrails_handler.go:65` |
| GET | `/agent-network/guardrails/{guardrailId}` | read | `guardrails_handler.go:45` |
| PUT | `/agent-network/guardrails/{guardrailId}` | update | `guardrails_handler.go:95` |
| DELETE | `/agent-network/guardrails/{guardrailId}` | delete | `guardrails_handler.go:135` |
| GET | `/agent-network/budget-rules` | read | `budget_handler.go:24` |
| POST | `/agent-network/budget-rules` | create | `budget_handler.go:64` |
| GET | `/agent-network/budget-rules/{ruleId}` | read | `budget_handler.go:44` |
| PUT | `/agent-network/budget-rules/{ruleId}` | update | `budget_handler.go:95` |
| DELETE | `/agent-network/budget-rules/{ruleId}` | delete | `budget_handler.go:135` |
| GET | `/agent-network/settings` | read | `settings_handler.go:53` (200+`null` if no row) |
| PUT | `/agent-network/settings` | update | `settings_handler.go:27` |
| GET | `/agent-network/consumption` | read | `consumption_handler.go:21` |
## gRPC RPCs added (or modified)
| RPC | Direction | Trigger |
| --- | --------- | ------- |
| `CheckLLMPolicyLimits` | proxy→mgmt unary | Pre-flight gate; returns allow/deny, selected policy, attribution group, window, deny code+reason (`proxy.go:259-301`). `Unimplemented` when limits service is nil. |
| `RecordLLMUsage` | proxy→mgmt unary | Post-flight write of tokens+cost against policy-window dimensions + every applicable account budget rule (`proxy.go:303-349`). `window_seconds==0` ⇒ no policy cap, only account fan-out runs. |
| `GetMappingUpdate`/`SendServiceUpdate` (stream) | mgmt→proxy | Snapshot (`proxy.go:752-780`) now appends `SynthesizeServicesForCluster`. Live updates use `SendServiceUpdateToCluster` + `shallowCloneMapping`. |
## Architecture & flow
### HTTP request lifecycle
```mermaid
sequenceDiagram
participant DB as Dashboard
participant R as gorilla.Router (/api)
participant H as handler (agentnetwork)
participant M as agentnetwork.Manager
participant S as store.Store
participant AM as accountManager (StoreEvent)
DB->>R: POST /api/agent-network/providers
R->>H: createProvider (auth mw sets UserAuth)
H->>H: GetUserAuthFromContext + validate(req)
H->>M: CreateProvider(userID, provider, bootstrapCluster)
M->>M: requirePermission(AgentNetwork, Create)
M->>S: SaveAgentNetworkProvider
M->>AM: StoreEvent(AgentNetworkProviderCreated)
M-->>H: created provider
H-->>DB: 200 + api.AgentNetworkProvider JSON
```
### Synth-service delivery via gRPC
```mermaid
sequenceDiagram
participant P as Proxy
participant G as ProxyServiceServer
participant SM as service.Manager (persisted)
participant SA as synthesizerAdapter
participant AN as SynthesizeServicesForCluster
participant ST as store.Store
Note over P,G: Initial snapshot
P->>G: GetMappingUpdate (stream open)
G->>SM: GetServicesForCluster(conn.address)
SM-->>G: persisted []*Service
G->>SA: SynthesizeServicesForCluster(conn.address)
SA->>AN: SynthesizeServicesForCluster(store, clusterAddr)
AN->>ST: walk every account; read providers/policies/settings
AN-->>SA: in-memory []*Service
SA-->>G: []*Service
G->>P: response (persisted + synth)
Note over G,P: Per-request live update
G->>G: SendServiceUpdateToCluster(update, clusterAddr)
G->>G: shallowCloneMapping(update) %% Private MUST survive
G->>P: response with single mapping
```
End-to-end: HTTP write persists rows and emits an activity event; the manager then triggers `proxyController.SendServiceUpdate` so proxies re-render. **The snapshot path is the only one that calls into the synthesiser** — on stream open it pulls persisted services then appends synth services for the cluster. Synth services are never persisted. For OIDC/session/tunnel-peer flows, `getServiceByDomain` falls back to `SynthesizeServicesForCluster(clusterFromDomain(domain))` when persisted lookup misses (`proxy.go:1763-1793`). The network_map contribution is orthogonal: per-peer compute prepends the same synth services to `account.Services` before `InjectProxyPolicies`.
## Permissions model added
- `permissions/modules/module.go:22` adds `AgentNetwork Module = "agent_network"`, registered in `All` (`module.go:42`). Standard `operations.{Read,Create,Update,Delete}` matrix.
- Handlers don't call `permissionsManager` directly — they extract `UserAuth` and delegate to `agentnetwork.Manager`, which gates every mutation through `requirePermission` (`manager.go:168, 308, 549`, etc.). Confirm your role-set provider has `agent_network` rows for owner/admin/user/billing-admin before merging.
- `getCatalogProviders` (`providers_handler.go:43`) intentionally skips RBAC — catalog is global static data.
## Activity codes added
`activity/codes.go:244-274` adds Activities 125-137 + string/code mappings (`codes.go:428-444`), following `<domain>.<resource>.<action>` (e.g., `agent_network.provider.create`). Audit-log exporters / SIEM forwarders need to know the new codes.
## Invariants
- **Synth services are never persisted.** Snapshot appends after `serviceManager.GetServicesForCluster` (`proxy.go:761-770`); network_map prepends before `InjectProxyPolicies` (`controller.go:117-126`).
- **`shallowCloneMapping` must round-trip every `ProxyMapping` field except `AuthToken`** — `proxy_clone_test.go:50-58` enforces via `gproto.Equal`. The bug it guards: a missing `Private` made every MODIFIED arrive `private=false`, the proxy skipped `ValidateTunnelPeer`, `UserGroups` stayed empty, `llm_router` denied `no_authorised_provider`; a restart "fixed" it because the snapshot uses the original mapping.
- **Limit-window floor is 60s** (`policies_handler.go:189-220`); enabled cap with both per-group and per-user at zero is rejected. Budget rules reuse the same validator (`budget_handler.go:170`).
- **Manager is optional at boot.** `NewAPIHandler` registers routes only when non-nil (`handler.go:129`); `ProxyServiceServer` returns `Unimplemented` from both RPCs when limits service is unwired (`proxy.go:262-265, 306-309`).
- **Settings GET on an unbootstrapped account returns 200 + `null`** (`settings_handler.go:65-72`) — not 404.
## Things to scrutinize
### Correctness
- **`injectAllProxyPolicies` runs on every per-peer compute**: `controller.go:163, 309, 415, 681`. `sendUpdateAccountPeers` is the target of the buffered fan-out — synth runs once per debounced account-update tick **and** once per direct `UpdateAccountPeer`. Cost is O(providers + policies × users-per-group) per account under `LockingStrengthNone`. No per-account synth cache — verify it fits the buffer interval for your largest tenant.
- **`clusterFromDomain` strips at the first `.`** (`proxy.go:1784-1792`). A zero-dot domain returns `""` and the synth call walks every account. Confirm no path reaches this with a malformed/internal domain.
- **Account-budget `RecordConsumption` fans out even when `window_seconds == 0`** (`proxy.go:341-348`) — intentional. Verify the proxy never sends `RecordLLMUsage` for a request that wasn't actually allowed.
### Security
- Every handler extracts `UserAuth` via `nbcontext.GetUserAuthFromContext` before any work. Routes live behind the standard `/api` mux; bypass list is not extended.
- `CheckLLMPolicyLimits` / `RecordLLMUsage` ride the existing **proxy → mgmt** gRPC connection auth. No additional token check inside the RPCs — they trust the connection. Confirm the proxy-side token-verification interceptor in this package gates both.
- `RecordLLMUsage` only validates `account_id != ""` (`proxy.go:317-319`). A compromised proxy can attribute cost to any account in its cluster — was already true for prior RPCs but is louder now that data drives denials.
### Concurrency
- `SetAgentNetworkSynthesizer` / `SetAgentNetworkLimitsService` write under `s.mu.Lock`; read paths copy the interface under read lock (`proxy.go:236-247, 260-263, 304-307`). Same pattern as existing `serviceManager`/`proxyController` setters.
- Manager writes use `LockingStrengthUpdate`; synth reads use `LockingStrengthNone` — read-after-write via the proxy snapshot can observe a stale view by up to one fan-out tick.
- Network_map controller is single-threaded per account; cross-account is parallel.
### Backward compatibility
- `proxy_clone_test.go` is the regression net; any new `ProxyMapping` field must be cloned or explicitly nulled in the test.
- `AccessLogEntry` adds indexed `AgentNetwork bool` — implicit AutoMigrate; deploy story must handle table-rewrite cost on high-volume access-log tables.
- `TargetOptions` gains seven `omitempty` JSON fields (`service.go:69-94`); on-wire shape stays compatible. `targetOptionsToProto` tests all fields when deciding nil (`service.go:551-556`).
- `NewAPIHandler` signature changes — every caller must pass `agentNetworkManager`; `nil` is supported.
### Observability
- 13 new activity codes via `accountManager.StoreEvent` in the manager — confirm dashboard's audit-log UI maps them.
- `AccessLogEntry.AgentNetwork` is indexed for the dashboard's agent-network log filter.
- New RPCs log at error level on store/selector failures (`proxy.go:284, 327, 332, 348`). Snapshot synth failures degrade to warnings — stream is not aborted (`proxy.go:765`).
## Test coverage
| Test | Locks down |
| ---- | ---------- |
| `handlers_test.go::TestPolicyHandler_WindowSecondsRoundTrip` | GET carries `window_seconds`; legacy `window_hours`/`window_days` absent. |
| `handlers_test.go::TestPolicyHandler_RejectsSubMinuteWindow` | POST `<60s` returns 4xx. |
| `handlers_test.go::TestConsumptionHandler_EmptyAccountReturnsArray` | `/consumption` returns `[]` — never null. |
| `handlers_test.go::TestConsumptionHandler_PopulatedAccountListsRows` | RecordConsumption×2 surfaces both with correct tokens/cost/window. |
| `budget_handler_test.go::TestBudgetRuleHandler_RoundTrip` | Targets + PolicyLimits shape round-trip. |
| `budget_handler_test.go::TestBudgetRuleHandler_ListReturnsArray` | Empty-list shape. |
| `budget_handler_test.go::TestBudgetRuleHandler_{RejectsMissingName,RejectsSubMinuteWindow}` | Validation rejections are 4xx. |
| `budget_handler_test.go::TestSettingsHandler_GetExposesCollectionToggles` | All four toggles + computed `Endpoint`. |
| `proxy_clone_test.go::TestShallowCloneMapping_PreservesAllFieldsExceptAuthToken` | Future-proofs clone; every field round-trips, `AuthToken` dropped. |
Handler tests use a real sqlite store + real manager + always-allow permissions mock (`handlers_test.go:53-75`). Create/update/delete success paths flow through `accountManager.StoreEvent` which the fixture doesn't wire — covered by manager-level no-mock tests outside this module.
## Known limitations / explicit non-goals
- No pagination on any list endpoint; no bulk endpoints.
- Synth result is not cached — every snapshot and every per-peer compute repeats the store walk.
- `getSettings` returning `200 + null` is a deliberate dashboard concession.
- No rate-limiting beyond the global `/api` rate limiter.
## Cross-references
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md), [management/store](20-management-store.md)
- Downstream: [proxy/runtime](33-proxy-runtime.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)

View File

@@ -1,215 +0,0 @@
# proxy/middleware-framework — generic plugin system
> **Risk level:** **High** — every proxied request transits this chain. Budget exhaustion, panic recovery, or chain-close bugs hit the hot path for all targets, not just agent-network ones.
> **Backward-compat impact:** Additive at the proxy. The `middleware` and `bodytap` packages are new (`proxy/internal/middleware/middleware.go:1`, `proxy/internal/middleware/bodytap/request.go:13`); existing proxy targets keep working until a chain is bound to them via `Manager.Rebuild`.
This module is the **framework only** — no LLM/agent-network domain knowledge is required, since every example built into it is generic.
## Module boundary
This module is the **framework only**: slots, chains, registry, dispatcher, accumulator, body-tap, output filters. No middleware *implementation* lives here — those land in `proxy/internal/middleware/builtin/*` (covered in module 31). The package contract is:
1. The proxy hands a `Manager` to its config-apply path. The synth pushes per-path `PathTargetBinding` lists (`proxy/internal/middleware/manager.go:26`) into `Manager.Rebuild`, which resolves each spec via the `Registry`/`Resolver` (`proxy/internal/middleware/registry.go:81-121`) and produces an immutable `Chain` keyed by `serviceID|pathID` (`proxy/internal/middleware/manager.go:410-412`).
2. The reverse-proxy handler captures the request body via `bodytap.CaptureRequest`, calls `Chain.RunRequest`, applies returned mutations (already filtered by `chain.applyMutations`), forwards to the upstream behind a `bodytap.CapturingResponseWriter`, then calls `Chain.RunResponse` and `Chain.RunTerminal`.
3. Middlewares are inert plugins that receive a deep-cloned `Input` and return an `Output` whose decision/mutations are clamped by the dispatcher's `filterOutput` (`proxy/internal/middleware/dispatcher.go:149-172`).
Everything that crosses the framework boundary in either direction is value-typed and deep-copied — middlewares cannot mutate the live request directly, and the framework cannot inadvertently leak middleware-owned slices into the request hot path.
## Files
| Path | Role |
| ---- | ---- |
| `proxy/internal/middleware/middleware.go` | `Middleware` + `Factory` interfaces. |
| `proxy/internal/middleware/types.go` | `Slot`, `FailMode`, `Decision`, all limit constants, `Input`/`Output`/`Mutations`/`UpstreamRewrite`/`AuthHeader` value types. |
| `proxy/internal/middleware/spec.go` | Apply-time `Spec` (validated wire shape + runtime-injected fields) and `Clone`. |
| `proxy/internal/middleware/registry.go` | `Registry` (factory map, RWMutex) and `Resolver` (Spec → bound `Middleware`). |
| `proxy/internal/middleware/manager.go` | `Manager`, `chainTable` reverse index, `Rebuild`/`Invalidate*`, async chain close. |
| `proxy/internal/middleware/chain.go` | `Chain.RunRequest`/`RunResponse`/`RunTerminal`, mutation gating, `cloneInputFor`. |
| `proxy/internal/middleware/chain_test.go` | Metadata threading, LIFO response order, rewrite gating, UserGroups propagation, terminal accumulation. |
| `proxy/internal/middleware/dispatcher.go` | Timeout/panic recovery, fail-mode, error classification, `filterOutput`. |
| `proxy/internal/middleware/decision.go` | `RenderDenyResponse`, deny-code regex, status clamp. |
| `proxy/internal/middleware/headerpolicy.go` | Compile-in header denylist + `FilterHeaderMutations`. |
| `proxy/internal/middleware/bodypolicy.go` | `ValidateBodyReplace` / `ApplyBodyReplace` smuggling guards. |
| `proxy/internal/middleware/keys.go` | Metadata key namespace constants. |
| `proxy/internal/middleware/metadata.go` | `Accumulator` — allowlist, per-mw/per-request byte caps, redaction. |
| `proxy/internal/middleware/metrics.go` | OTel instrument bundle (`proxy.middleware.*`). |
| `proxy/internal/middleware/redaction.go` | `Scan` — PEM/JWT/AWS/bearer/Luhn-validated CC patterns. |
| `proxy/internal/middleware/bodytap/request.go` | Capture + replay reader, `Budget` semaphore, bypass reason codes. |
| `proxy/internal/middleware/bodytap/response.go` | `CapturingResponseWriter` (tee with `PassthroughWriter` for Flusher/Hijacker preservation). |
## Slot model
Three slots, declared per-middleware exactly once (`proxy/internal/middleware/types.go:27-41`):
- **`SlotOnRequest`** (`Slot=1`) — runs **before** the upstream call, in registration order. May `DecisionDeny`, may emit `Mutations` (header add/remove, body replace, `UpstreamRewrite`) when both `Spec.CanMutate` and `Middleware.MutationsSupported()` are true. May emit metadata. Each middleware in the slot sees metadata that earlier ones in the same slot just emitted (`proxy/internal/middleware/chain.go:144-178`) — this is how the framework gives middlewares an intra-slot side channel without a global bag.
- **`SlotOnResponse`** (`Slot=2`) — runs **after** the upstream returns, in **reverse** registration order. Cannot deny (clamped in `dispatcher.filterOutput`, `proxy/internal/middleware/dispatcher.go:153-157`). May still mutate response headers in principle, but the current chain only forwards `RewriteUpstream` from on_request, so on_response mutations are observe-only in practice. Threads the same per-slot metadata view as on_request.
- **`SlotTerminal`** (`Slot=3`) — runs **after** every on_response middleware has emitted, in registration order. Sees the full accumulated bag plus prior terminal emissions (`chain.go:221-245`). Cannot deny, cannot mutate (`dispatcher.go:168-170`). Designed for sinks (access log, metrics push, audit emitter).
Splitting a feature across slots (e.g. "parse on the way out, ship on terminal") is the explicit architectural choice — `types.go:7-15` and `types.go:22-25` make it clear no middleware participates in more than one slot.
## Architecture & flow
### Chain dispatch
```mermaid
sequenceDiagram
autonumber
participant H as proxy HTTP handler
participant BT as bodytap.CaptureRequest
participant CH as Chain
participant DI as Dispatcher
participant MW as Middleware (per slot)
participant US as Upstream
participant CW as CapturingResponseWriter
H->>BT: CaptureRequest(r, cfg, budget)
BT-->>H: body[], truncated, release()
H->>CH: RunRequest(ctx, r, Input, Accumulator)
loop on_request, registration order
CH->>CH: cloneInputFor(in, OnRequest)
CH->>DI: Invoke(ctx, spec, mw, call)
DI->>MW: mw.Invoke(callCtx, in)
MW-->>DI: Output{decision, metadata, mutations?}
DI->>DI: filterOutput (clamp deny, gate mutations)
DI-->>CH: filtered Output
CH->>CH: Accumulator.Emit (allowlist + caps + redact)
alt DecisionDeny
CH-->>H: denied, merged, rewrite
else allow
CH->>CH: applyMutations(r, m) and capture rewrite
end
end
CH-->>H: nil, merged, rewrite
H->>US: ProxyRequest (with rewrite/mutations applied)
US-->>CW: bytes (streamed, tee'd into cap-bounded buf)
CW-->>H: passthrough complete
H->>CH: RunResponse(ctx, Input{RespBody:CW.Body(),...}, acc)
loop on_response, REVERSE order (LIFO)
CH->>DI: Invoke (same wrappers)
end
H->>CH: RunTerminal(ctx, Input{Metadata:full bag}, acc)
H->>BT: release() + CW.Release()
```
### Body-tap mechanics (request + response)
```mermaid
flowchart LR
subgraph req[Request capture — bodytap.CaptureRequest]
R0[r.Body] --> R1{cfg.MaxRequestBytes > 0?\nUpgrade absent?\nContent-Type allowed?\nCL <= cap?}
R1 -- no --> R2[bypass = reason\nbody = nil\nr.Body untouched]
R1 -- yes --> R3[Budget.Acquire(cap)]
R3 -- denied --> R4[bypass=BypassBudget]
R3 -- ok --> R5[io.LimitReader(r.Body, cap+1)\nio.ReadAll]
R5 --> R6{len > cap?}
R6 -- truncated --> R7[viewable = buf[:cap]\nr.Body = replayReadCloser{buf, tail}]
R6 -- whole --> R8[r.Body = NopCloser(bytes.Reader(buf))\nclose original]
R7 --> R9[(release captured\nbudget on req end)]
R8 --> R9
end
subgraph resp[Response capture — CapturingResponseWriter]
W0[client] -.-> CW[Write(p)]
CW --> P1[PassthroughWriter.Write(p)\n— bytes leave to client first]
P1 --> P2{!stopped?}
P2 -- yes --> P3{remaining = cap - buf.Len()}
P3 --> P4[buf.Write(p[:take])\nset truncated if take<n]
P2 -- no --> P5[silent drop into the tee\n(client write already done)]
end
```
The body-tap is the highest-leak-risk surface in this module; three details matter:
1. **Request capture is "read-and-replay", not "read-and-forward".** `CaptureRequest` always swaps `r.Body` for either a `bytes.Reader` (whole body fit) or a `replayReadCloser` that replays the captured prefix then drains the remaining stream from the original body (`bodytap/request.go:178-201`). This means the **upstream still sees the full body even when the tap truncates**. The original `r.Body` is **not** closed in the truncated branch — `replayReadCloser.Close()` only closes the tail (`bodytap/request.go:199-201`), which is the same reader, so close once on request end is correct, but reviewers should confirm the upstream proxy always reads to EOF (otherwise the tail is leaked).
2. **Response capture is a write-through tee.** `CapturingResponseWriter.Write` forwards to the underlying writer **first** (`bodytap/response.go:116-117`), then tees into `buf` under its own mutex. Client never blocks on the tee. `Flusher`/`Hijacker` are preserved via the embedded `responsewriter.PassthroughWriter`. SSE/chunked streams flow through untouched; middlewares only see the bounded prefix.
3. **Budget is a single shared semaphore.** `Manager` constructs one `bodytap.Budget` at startup (`manager.go:138-144`, default `256 MiB` from `bodytap/request.go:39`). Every capture pre-acquires its full `MaxRequestBytes` / `MaxResponseBytes` from the budget regardless of actual body size; that prevents a flood of small captures from collectively exceeding the cap, but it also means a misconfigured `MaxRequestBytes = 1 MiB` with 256 concurrent requests already exhausts the default budget. Reviewers should sanity-check the operator-facing defaults that ship with synth-service.
The framework explicitly aborts capture (and increments `proxy.middleware.capture_bypass_total`) before reading the first byte when `Upgrade`/`Connection: upgrade` is set (`bodytap/request.go:120-125`), when the content-type isn't in the allowlist (`bodytap/request.go:126-128`), or when the advertised `Content-Length` already exceeds the cap (`bodytap/request.go:131-133`). This is the right place to make sure WebSocket upgrades and large file uploads never reach the buffer.
## Public contracts
- **`Middleware` interface** (`middleware.go:14-36`): `ID()`, `Version()`, `Slot()`, `AcceptedContentTypes()`, `MetadataKeys()`, `MutationsSupported()`, `Invoke(ctx, *Input) (*Output, error)`, `Close()`. `MetadataKeys()` is the **closed set** the middleware is allowed to emit — the accumulator drops anything outside it (`metadata.go:71-75`). `Close` must be idempotent (called even when `Invoke` was never reached).
- **`Factory` interface** (`middleware.go:44-47`): `ID()`, `New(rawConfig []byte) (Middleware, error)`. `RawConfig` is opaque JSON bytes on the wire (`spec.go:6-12`); each factory owns its own typed config.
- **`Decision` type** (`types.go:59-69`): `Allow=0`, `Deny=1`, `Passthrough=2`. Default-zero is permissive — important because every middleware that omits `Decision` gets `Allow`. Dispatcher clamps `Deny` to `Passthrough` outside `SlotOnRequest` (`dispatcher.go:153-157`).
- **`Mutations`** (`types.go:196-201`): `HeadersAdd`/`HeadersRemove` (filtered through `headerpolicy.go`), `BodyReplace` (gated through `bodypolicy.go`), and `RewriteUpstream`. `RewriteUpstream` is **last-write-wins** within the on_request slot (`chain.go:170-172`, locked down by `TestChain_RunRequest_LatestRewriteWins`).
- **Metadata propagation keys** (`keys.go`): all keys live in a single file and follow `^[a-z][a-z0-9_-]*(\.[a-z0-9_-]*)+$` (`metadata.go:8`). Framework-injected error tagging uses `mw.<id>.error_kind` (`keys.go:81`) so operators can distinguish framework-emitted entries from middleware-emitted ones.
## Invariants
- **Per-request context isolation.** `cloneInputFor` deep-copies every mutable field (`Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames`) before each invocation (`chain.go:286-308`). A misbehaving middleware that mutates `in.Headers` only corrupts its own copy.
- **Body-tap bounded by capture limit.** Request side uses `io.LimitReader(r.Body, limit+1)` (`bodytap/request.go:152`) — the `+1` is how the code detects truncation (`bodytap/request.go:160`); the surfaced buffer is sliced back down to `limit`. Response side stops teeing once `buf.Len() >= cap` (`bodytap/response.go:121-133`). Neither side can grow the buffer past the configured cap.
- **Headers/body redaction order.** Accumulator runs `Scan(value)` **before** counting cost (`metadata.go:81-82`), so the byte budgets are computed against post-redaction sizes. `Scan` order is PEM → JWT → AWS key → bearer → Luhn-validated CC (`redaction.go:25-51`) — the comment block in `redaction.go:8-13` is explicit that this is best-effort, not DLP.
- **No middleware can starve the chain.** Every invocation runs inside `context.WithTimeout(ctx, clampTimeout(spec.Timeout))` in a separate goroutine (`dispatcher.go:51-94`), with the deadline race-`select`ed against the result channel. A blocked middleware fires the timeout path, gets fail-mode'd, and `IncError(kind=timeout)`. Timeouts are clamped to `[10ms, 5s]` (`types.go:80-86`, `dispatcher.go:174-185`).
- **Panic recovery.** `recover()` captures the panic, logs only the type + a 4 KiB stack prefix (no panic value — avoids leaking secrets the middleware was processing), and produces a `panicError` that flows through fail-mode (`dispatcher.go:64-76`).
- **Chain immutability + atomic swap.** `chainTable` is cloned on every `Rebuild`/`Invalidate*` and swapped via `atomic.Pointer` (`manager.go:44-69`, `manager.go:221-300`). Readers (`ChainFor`) are lock-free; writers serialise on `writeMu`. The retired chain is `Close`-d in a background goroutine bounded by `chainCloseTimeout = 2 * MaxTimeout` (`manager.go:21-22`, `manager.go:326-346`), so in-flight invocations finish on the old chain after the swap.
## Things to scrutinize
### Correctness
- **Chain ordering deterministic from synth output?** `Manager.buildChain` iterates `b.Specs` in slice order and appends to `bound` (`manager.go:366-391`); `NewChain` then partitions by slot but **preserves slice order within each slot** (`chain.go:50-60`). So order on the wire = order observed at runtime. Synth must therefore emit specs in the intended execution order — there is no per-spec `Priority` field. Worth flagging.
- **Decision short-circuit semantics.** `RunRequest` returns immediately on `DecisionDeny` (`chain.go:164-167`) **with the metadata accumulated so far** plus the `denied.Metadata`. Callers that ignore `merged` on deny will lose framework-injected `mw.<id>.error_kind` entries. The proxy runtime is the only caller; confirm it always feeds `merged` into the access log on the deny path as well.
- **`UpstreamRewrite` `AuthHeader` bypass** (`types.go:218-235`). The `AuthHeader`/`StripHeaders` fields *intentionally* bypass the header denylist on the basis that the proxy itself rewrites auth. The denylist still blocks middleware-emitted `HeadersAdd: Authorization=...`. This is a delicate carve-out — review the runtime consumer to confirm only the trusted upstream-build path unpacks `AuthHeader`, never the generic `applyMutations` loop.
- **`replayReadCloser.Close` only closes the tail** (`bodytap/request.go:199-201`). The replay buffer doesn't own a resource, so this is correct, but it conflates "replay finished" with "underlying body closed". If a caller `Close()`s without reading to EOF, the original body is closed but the captured prefix is lost; harmless for the proxy path (upstream always reads to EOF) but worth a doc-comment.
### Security
- **Body-tap memory bounds.** Discussed above — bounded by `MaxBodyCapBytes = 1 MiB` per direction (`types.go:77`) and the shared `Budget` (default 256 MiB). The concerning case is the **deep-copy in `cloneInputFor`** (`chain.go:300-306`): every middleware invocation gets its **own copy** of `Body` and `RespBody`. A chain of N middlewares with a 1 MiB body allocates N MiB of transient bytes per request. With `MaxMiddlewaresPerChain = 16` (`types.go:103`) that's up to 16 MiB extra per in-flight request. Worth pricing into the budget model.
- **Header redaction completeness.** `denyHeaders` (`headerpolicy.go:5-17`) covers the auth/forwarding family and framing (`Content-Length`, `Transfer-Encoding`, `Trailer`). `denyHeaderPrefixes` covers `X-Authenticated-*`, `X-Forwarded-*`, `X-Remote-*`, `X-NetBird-*`. Notably absent: `Range`, `If-Match`/`If-None-Match` (mutation could cause cache poisoning), `Origin`/`Referer`. Not necessarily wrong, but worth a deliberate decision.
- **Metadata key collisions across middlewares.** The accumulator has no cross-middleware uniqueness check; two middlewares with the same key in their allowlist can both emit it, and both copies land in `merged` (`metadata.go:51-99`). Downstream consumers must tolerate duplicates. Worth documenting.
- **Deny rendering.** `RenderDenyResponse` only allows codes matching `^[a-z][a-z0-9._-]{0,63}$` (`decision.go:9`), redacts/truncates message + detail values, caps `Details` at 8 entries (`decision.go:42-50`), clamps status to `[400,499]\{401}` (`decision.go:65-73`). The deny body type is fixed; middlewares cannot inject arbitrary JSON.
### Concurrency
- **Per-request state vs shared state in factories.** Each `Factory.New` is called once per chain build; the returned `Middleware` instance is **shared across all requests** for that chain. `Invoke` must be reentrant. The framework does not enforce this — a buggy middleware that holds per-call state on the struct will silently race. Suggest a `// Invoke must be safe for concurrent use` doc on the interface.
- **`chainTable` clone-on-write** is correct, but `addChain`/`removeChain` mutate the *cloned* table before the swap (`manager.go:71-108`), and they're called under `writeMu`. Readers only ever see the post-swap pointer. Good.
- **`Chain.inflight` WaitGroup**. `Run*` does `Add(1)`/`Done()` (`chain.go:142-143`, `chain.go:194-195`, `chain.go:225-226`); `Close` waits on it bounded by ctx (`chain.go:75-85`). One concern: a *new* `RunRequest` can `Add(1)` *after* `Close` started waiting if the caller still holds a stale chain pointer. `WaitGroup` does not panic on this if the count was already > 0 at `Wait` time, but it does panic if `Add` happens after `Wait` returns and another `Wait` runs. `Close` is documented one-shot, so single-`Wait` is fine, but callers must drop the chain reference before calling `Close`. Worth a code comment near `Close`.
- **Goroutine leaks.** `Dispatcher.Invoke` spawns one goroutine per call and *always* writes to a buffered (cap=1) channel (`dispatcher.go:62-76`), so even if the timeout fires the goroutine completes its send and exits. No leak.
- **`closeChainsAsync`** detaches retired chains into a goroutine (`manager.go:326-346`). If `Manager` is never GC'd this is fine, but there's no shutdown hook to wait on outstanding closes. Reviewers should confirm the proxy shutdown path explicitly drains in-flight requests before tearing down `Manager`, or accept that the last chain-close round may be cut short on exit.
### Performance
- **Allocations per request.** `cloneInputFor` allocates new slices for `Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames` — once per middleware per request. For a typical 5-middleware chain on a 1 KiB body that's ~10 small slice allocs plus one `Body` copy each. Not a hot-path crisis, but `sync.Pool` for the per-call `Input` would be a natural follow-up.
- **Accumulator allocates a fresh `allowSet` per `Emit` call** (`metadata.go:55-58`). One per middleware per slot pass = up to 48 per request. Cheap, but worth noting.
- **Regex cost.** `Scan` runs five regex passes on every accepted metadata value (`redaction.go:25-51`). Bounded by `MaxMetadataValueBytes = 4 KiB` so worst case is small.
### Observability
- **Per-middleware metrics.** `proxy.middleware.requests_total{middleware,target_id,outcome}` (`metrics.go:34-41`), `duration_ms`, `invocations_total`, `errors_total{kind}`, `metadata_rejected_total{reason}`, `header_mutation_blocked_total{header}`, `capture_bypass_total{reason}`. Comprehensive surface; operators can alert on `errors_total{kind=panic}` and `errors_total{kind=timeout}` separately. **Latency histogram is in milliseconds with default OTel buckets** — for a 10ms5s timeout range default buckets cover OK, but a custom bucket set centred on 1500ms would resolve the agent-network response-parser tail better.
- **Decision logs.** Panic logs (`dispatcher.go:69`) include `request_id`, type, and stack but not the panic value (safe). `Chain.Close` logs middleware-close errors at debug (`chain.go:91`). `applyMutations` logs body-replace rejections at warn (`chain.go:278`). No log on the deny path itself — by design, since the access-log terminal middleware is expected to record outcomes.
## Test coverage
| Test file | Locks down |
| --------- | ---------- |
| `proxy/internal/middleware/chain_test.go:77` | `RunRequest` threads metadata across on_request middlewares (regression for the "later mw can't see earlier mw's emissions" bug). |
| `chain_test.go:110` | `RunResponse` reverse-order threading. |
| `chain_test.go:142` | `cost_meter`-shaped scenario: response_parser registered after cost_meter still emits *before* cost_meter sees the bag (guards the `cost.skipped=missing_tokens` regression). |
| `chain_test.go:178` | `UpstreamRewrite` last-write-wins. |
| `chain_test.go:206` | No middleware emits → nil rewrite. |
| `chain_test.go:224` | Rewrite filtered when `CanMutate=false`. |
| `chain_test.go:245` | `Input.UserGroups` propagates verbatim through `cloneInputFor`. |
| `chain_test.go:304` | Terminal middlewares see the full accumulated bag + prior terminal emissions. |
**Gaps** worth raising with the author:
- No direct test for `Dispatcher.Invoke` timeout / panic / fail-mode behaviour at the framework level (covered indirectly by built-in tests, but a unit test pinning `errors_total{kind=...}` labels would be cheap insurance).
- No test for `bodytap.CaptureRequest` truncated replay (the upstream-sees-full-body invariant is exactly the kind of thing a regression would silently break).
- No test for `Budget` exhaustion behaviour under concurrency.
- No test for `Manager.InvalidateMiddleware` + `LiveServiceCheck` race (the auth-revocation race the comment at `manager.go:33-38` calls out is the load-bearing reason for `LiveServiceCheck`).
## Known limitations / explicit non-goals
- **No middleware-to-middleware RPC.** Side-channel is metadata only.
- **No streaming body inspection.** Middlewares see a bounded prefix; SSE / chunked parsing happens against that prefix in the response middleware.
- **No per-spec priority.** Order is registration order in the spec slice.
- **No retry / circuit-breaker** on middleware errors. Fail-mode is binary (open/closed) and per-spec.
- **Mutations cannot rewrite the request URL path or query** — only `RewriteUpstream` can change scheme/host (+ optional path replacement, see `types.go:218-235`).
- **Redaction is best-effort.** Explicitly documented in `redaction.go:8-13`. Not a DLP solution.
## Cross-references
- Upstream wire shape: [../modules/10-shared-api.md](10-shared-api.md) (Spec/RawConfig encoding from management).
- Built-in middlewares using this framework: [../modules/31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md).
- Runtime wiring (where `Manager`, `Chain`, and `bodytap` are consumed by the HTTP handler): [../modules/33-proxy-runtime.md](33-proxy-runtime.md).
- End-to-end request flow including capture + chain dispatch: [../01-end-to-end-flows.md](../01-end-to-end-flows.md).
- Top-level architecture: [../00-overview.md](../00-overview.md).

View File

@@ -1,365 +0,0 @@
# proxy/middleware-builtin — the LLM chain
The registry-mounted middleware set the proxy executes on every agent-network
LLM request. The two highest-blast-radius areas are the **capture-pointer
semantics** and the **limit_check ⇒ limit_record** record-once invariant.
Sibling module: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — the SDK
adapters + pricing catalog this chain delegates to.
---
## Module boundary
This module is the registry-mounted middleware set the proxy executes on
every agent-network LLM request. Each sub-package registers itself via
`init()`
([builtin.go:3234](../../../proxy/internal/middleware/builtin/builtin.go));
the proxy server anonymous-imports the set
([all_test.go:1119](../../../proxy/internal/middleware/builtin/all_test.go))
so the registry is populated at boot. The chain is wired by the management
synthesiser and executed by the framework
(`proxy/internal/middleware/{chain,dispatcher,accumulator}.go` — both out
of scope). Everything here reads from / writes to one envelope: the
`middleware.KV` metadata bag plus `middleware.Mutations` for header/body
rewrites.
## The 8 middlewares
| Name | Slot | Inputs (metadata read) | Outputs (metadata written) | Side effects |
|---|---|---|---|---|
| `llm_request_parser` | OnRequest | `Input.{URL,Body,BodyTruncated}` | `llm.{provider,model,stream,request_prompt_raw,capture_truncated}` | none |
| `llm_router` | OnRequest | `llm.model`, `Input.{URL,UserGroups}` | `llm.{resolved_provider_id,authorising_groups}`, `llm_policy.{decision,reason}` | upstream rewrite + auth strip/inject |
| `llm_limit_check` | OnRequest | `llm.{resolved_provider_id,model}`, `Input.{AccountID,UserID,UserGroups}` | `llm.{selected_policy_id,attribution_group_id,attribution_window_seconds}`, `llm_policy.{decision,reason}` | gRPC `CheckLLMPolicyLimits` |
| `llm_identity_inject` | OnRequest | `llm.{resolved_provider_id,authorising_groups}`, `Input.{UserEmail,UserID,UserGroups,UserGroupNames}` | none | header strip/inject + optional body rewrite |
| `llm_guardrail` | OnRequest | `llm.{model,request_prompt_raw}` | `llm_policy.{decision,reason}`, `llm.request_prompt` | none (model allowlist deny) |
| `llm_response_parser` | OnResponse | `llm.provider`, `Input.{RespHeaders,RespBody,Status}` | `llm.{input,output,total,cached_input,cache_creation}_tokens`, `llm.response_completion` | none |
| `cost_meter` | OnResponse | `llm.{provider,model}`, token buckets | `cost.usd_total` or `cost.skipped` | pricing lookup |
| `llm_limit_record` | OnResponse | `llm.{attribution_group_id,attribution_window_seconds,input_tokens,output_tokens}`, `cost.usd_total` | none | gRPC `RecordLLMUsage` |
[all_test.go:2640](../../../proxy/internal/middleware/builtin/all_test.go)
locks the ID set; adding or removing one is a conscious extension.
## Files
| File | LOC | Notes |
|---|---:|---|
| `builtin.go` | 86 | Registry + `FactoryContext` (ctx, data dir, meter, logger, mgmt client) |
| `all_test.go` | 41 | Locks the 8-ID registry surface |
| `agentnetwork_chain_integration_test.go` | 319 | Live sqlite + real gRPC bufconn; gate→recorder wire path |
| `llm_request_parser/*` | 162 / 66 / 356 | Provider detection, body parse, prompt extraction with capture-pointer gating |
| `llm_router/*` | 385 / 84 / 586 | Three-pass route selection (model → groups → path-prefix) |
| `llm_limit_check/*` | 196 / 38 / 182 | Pre-flight `CheckLLMPolicyLimits` (2s, fail-open) |
| `llm_identity_inject/*` | 440 / 108 / 666 | HeaderPair (LiteLLM) + JSONMetadata (Portkey) + ExtraHeaders |
| `llm_guardrail/*` | 176 / 82 / 75 / 219 / 217 | Model allowlist + optional prompt capture with PII redaction |
| `llm_response_parser/*` | 258 / 222 / 43 / 433 / 169 / 111 | Buffered + SSE accumulation; AWS event-stream accumulator (`streaming_bedrock.go`) for Bedrock; capture-pointer gates completion emit |
| `cost_meter/*` | 181 / 84 / 439 | Token → USD via `proxy/internal/llm/pricing` |
| `llm_limit_record/*` | 144 / 35 / 191 | Post-flight `RecordLLMUsage` (5s, debug-on-error) |
## Per-middleware
### llm_request_parser
Detects the LLM provider via `llm.DetectParser` (URL sniff) or by name via
`llm.ParserByName` when synthesiser stamps `provider_id`
([middleware.go:9699](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
**Path-routed providers short-circuit first:** `parseVertexPath` and
`parseBedrockPath` ([middleware.go:8594](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
pull the model + vendor out of the URL before parser selection runs — Vertex
from `/v1/projects/.../publishers/{pub}/models/{model}:{action}` (publisher →
vendor via `vertexPublisherVendor`), Bedrock from `/model/{id}/{action}` with
`normalizeBedrockModel` stripping the region prefix + version suffix. See
[50-path-routed-providers.md](./50-path-routed-providers.md) for the full path
grammar. For body-routed providers it decodes the body into `RequestFacts`
(model + stream) and extracts the prompt. On
`capture_prompt=true` (or absent — see capture-pointer semantics below) the
prompt is run through `llm_guardrail.RedactPII` when `redact_pii=true` and
truncated rune-safely to 3500 bytes
([middleware.go:109122](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
**Key invariant:** redaction is parser-side, not guardrail-side — access-log
reads `llm.request_prompt_raw` directly.
### llm_router
Three-pass route selection in `matchRoute`
([middleware.go:241300](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
filter by `Models` claim → vendor-pin (a vendor-tagged request never crosses to
another vendor's route) → filter by `AllowedGroupIDs` intersection → model
precedence over path → tie-break by longest `UpstreamPath` prefix match.
Model-miss returns `llm_policy.model_not_routable`; known-but-unauthorised
returns `llm_policy.no_authorised_provider`. **Key invariant:** auth-header
strip+inject rides on `UpstreamRewrite.{StripHeaders,AuthHeader}`
([middleware.go:606646](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
— NOT `HeadersAdd/HeadersRemove` — because the framework's mutation gate
blocks `Authorization` on the generic header path.
**Path-routed providers route before the model table.** `Invoke` checks
`isVertexPath` / `isBedrockPath`
([middleware.go:138216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
ahead of the model lookup, so a path-carried model can't be claimed by a
same-vendor body-routed provider. `matchPathRoute` enforces the route's `Models`
allowlist (empty = catch-all) even though the model came from the URL.
Two path-only behaviours:
- **Vertex unmeterable publisher** — when `llm_request_parser` emits no
`llm.provider` (e.g. Gemini/`google`), the router denies with
`llm_policy.unmeterable_publisher` (403) rather than forward it uncounted.
- **GCP token minting** — when the route carries `GCPServiceAccountKeyB64`
(set from a `keyfile::` api_key), `gcpBearer` mints + caches a short-lived
OAuth2 token per request instead of injecting a static value; a bad key or
unreachable token endpoint denies with `llm_policy.upstream_auth_failed`
(502). Bedrock uses its static bearer token directly (no minting).
- **`/bedrock` prefix** — an optional `/bedrock` gateway-namespace prefix is
accepted and stripped via `RewriteUpstream.StripPathPrefix` so the native
`/model/...` path reaches the upstream.
Full treatment in [50-path-routed-providers.md](./50-path-routed-providers.md).
### llm_limit_check
Pre-flight gate. Reads `llm.resolved_provider_id`, calls
`CheckLLMPolicyLimits` with a 2s context timeout
([middleware.go:24, 97106](../../../proxy/internal/middleware/builtin/llm_limit_check/middleware.go)),
on allow stamps `llm.selected_policy_id`, `llm.attribution_group_id`,
`llm.attribution_window_seconds`. **Key invariant:** fail-open. Nil
`MgmtClient`, empty provider id, or RPC error returns `allowNoAttribution()`
— management outage doesn't take down every LLM request. Operators audit via
the access-log; a future flag may switch this to fail-closed.
### llm_identity_inject
Dispatches per-rule between LiteLLM-shaped `HeaderPair`
([middleware.go:169](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
and Portkey-shaped `JSONMetadata`
([middleware.go:292](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go)).
Identity is the peer's email (or `UserID` fallback); tags are the
**authorising-groups intersection** emitted by `llm_router`, not the full
`UserGroups` — a peer in 5 groups authorised under 1 only tags as that 1.
**Anti-spoof:** every `HeadersAdd` is preceded by a `HeadersRemove` of the
same name; the framework runs `Remove` before `Add` so client-supplied
identity never reaches the upstream. Body-level inject (`tags_in_body`,
`end_user_id_in_body`) is skipped on empty / truncated / non-JSON bodies so
header attribution stays intact.
### llm_guardrail
Model allowlist deny + optional prompt-capture-with-redaction. Allowlist
match is case-insensitive via `normaliseModel`; empty allowlist disables the
check. Prompt capture reads `llm.request_prompt_raw` and emits
`llm.request_prompt` only when `prompt_capture.enabled`
([middleware.go:149165](../../../proxy/internal/middleware/builtin/llm_guardrail/middleware.go)).
**Key invariant:** `RedactPII` is the exported function the parsers call —
single PII contract across all three keys.
### llm_response_parser
Buffered and SSE paths share one `Invoke`
([middleware.go:102127](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go)):
content-type sniffing dispatches to `invokeBuffered` (JSON, status<400) or
`invokeStreaming` (text/event-stream, partial bodies tolerated). Streaming
delegates to `accumulateStream`
([streaming.go:2130](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
using `llm.NewScanner`. A third path, `accumulateBedrockStream`
([streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go)),
decodes the AWS binary event-stream (`application/vnd.amazon.eventstream`)
returned by Bedrock's `-stream` actions — InvokeModel `chunk` frames wrap a
base64 Anthropic event, Converse frames carry text + a trailing usage block.
Cached / cache-creation buckets emit only when non-zero, preserving the existing
token schema.
### cost_meter
Reads `llm.provider` + `llm.model` + token buckets, looks up per-1k rate via
`pricing.Loader`, emits `cost.usd_total` or a closed-set `cost.skipped`
reason (`missing_provider/model/tokens`, `unparseable_tokens`, `zero_tokens`,
`unknown_model`). Loader's hot-reload goroutine is bound to proxy-lifetime
context via `startReloader`. **Key invariant:** provider-shape switch lives
in `pricing.Table.Cost` (sibling doc) — `cost_meter` stays provider-agnostic.
### llm_limit_record
Post-flight write. Always returns `DecisionAllow`; response has already been
served so RPC errors mustn't surface (logged at `Debugf`). Skip-on-no-signal
at line 81 (zero tokens + zero cost). **Key invariant:** the
skip-on-missing-attribution guard at line 98 is a safety net independent of
the framework's deny short-circuit — if the gate denied and the framework
still runs the recorder, the recorder skips on absent
`UserID`+`groupID`+`UserGroups` and no phantom counter materialises.
## Full-chain diagram (canonical order)
```mermaid
flowchart TD
A[HTTP request] --> B[llm_request_parser<br/>OnRequest]
B -->|llm.provider, llm.model,<br/>llm.stream, llm.request_prompt_raw| C[llm_router<br/>OnRequest]
C -->|llm.resolved_provider_id,<br/>llm.authorising_groups,<br/>upstream rewrite + auth| D[llm_limit_check<br/>OnRequest]
D -->|deny path| Z1[403 llm_policy.*]
D -->|allow + llm.selected_policy_id,<br/>llm.attribution_group_id,<br/>llm.attribution_window_seconds| E[llm_identity_inject<br/>OnRequest]
E -->|header strip+inject<br/>+ optional body rewrite| F[llm_guardrail<br/>OnRequest]
F -->|deny: model_blocked| Z2[403 llm_policy.model_blocked]
F -->|allow + llm.request_prompt| G[upstream LLM call]
G --> H[llm_response_parser<br/>OnResponse]
H -->|llm.{input,output,total,cached_input,cache_creation}_tokens,<br/>llm.response_completion| I[cost_meter<br/>OnResponse]
I -->|cost.usd_total or cost.skipped| J[llm_limit_record<br/>OnResponse]
J --> K[response to client]
```
## limit_check ⇒ limit_record record-once invariant
```mermaid
sequenceDiagram
participant LC as llm_limit_check
participant M as management gRPC
participant U as upstream LLM
participant LR as llm_limit_record
participant DB as sqlite consumption table
LC->>M: CheckLLMPolicyLimits (2s)
alt allow
M-->>LC: selected_policy_id, attribution_group_id, window_s
LC->>U: stamps attribution metadata
U-->>LR: response + tokens (via llm_response_parser + cost_meter)
LR->>M: RecordLLMUsage (5s, debug-on-error)
M->>DB: increment (user, group, window) row
else deny
M-->>LC: llm_policy.token_cap_exceeded
Note over LR: framework short-circuits; even if invoked,<br/>recorder skips on absent UserID+groupID+UserGroups
else mgmt nil / rpc error
LC-->>LC: allowNoAttribution() — fail open
Note over LR: no window_s ⇒ recorder books only account-level<br/>budget rules (which run independently)
end
```
The integration test
[agentnetwork_chain_integration_test.go](../../../proxy/internal/middleware/builtin/agentnetwork_chain_integration_test.go)
exercises all three branches against a real sqlite store + bufconn gRPC —
no mocks. Tests: `TestChain_AllowPath_StampsAttributionAndRecordsCounter`
(line 130), `TestChain_DenyPath_GateRejectsAndNoConsumptionWritten` (line
207), `TestChain_CapExhaustTransition` (line 265).
## Public contracts (per-middleware JSON config)
| Middleware | Config shape |
|---|---|
| `llm_request_parser` | `{provider_id?, redact_pii?, capture_prompt?: *bool}` ([factory.go:1937](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)) |
| `llm_router` | `{providers: [{id, models, upstream_scheme, upstream_host, upstream_path?, auth_header_name, auth_header_value, allowed_group_ids}]}` |
| `llm_limit_check` | `{}` — pulls `MgmtClient` from `FactoryContext` |
| `llm_identity_inject` | `{providers: [{provider_id, header_pair?|json_metadata?, extra_headers?}]}` |
| `llm_guardrail` | `{model_allowlist: []string, prompt_capture: {enabled, redact_pii}}` |
| `llm_response_parser` | `{redact_pii?, capture_completion?: *bool}` |
| `cost_meter` | `{pricing_path?}` (basename inside data-dir; defaults `pricing.yaml`) |
| `llm_limit_record` | `{}` — same pattern as `llm_limit_check` |
All factories accept empty / null / `{}` / whitespace as zero-value config;
only structurally invalid JSON is rejected so misconfig surfaces at chain
build time.
## Invariants
1. **limit_check ↔ limit_record paired.** They MUST appear together. Gate
stamps attribution metadata on the request leg; recorder reads it on the
response leg. If a chain contains only the recorder, the
skip-on-missing-attribution guard at
[llm_limit_record/middleware.go:8187, 98103](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go)
keeps counters consistent but no enforcement runs. Only-gate means
counters never tick and headroom appears infinite.
2. **`capture_prompt` / `capture_completion` pointer semantics.** Both are
`*bool`. `nil` = "preserve legacy emit" (back-compat default for
non-agent-network callers and pre-toggle tests). `false` = suppress the
key entirely (access-log row carries zero prompt / completion content).
`true` = emit. The synthesiser sets the pointer explicitly to the
account's `EnablePromptCollection` toggle. The handling lives
in [llm_request_parser/factory.go:5561](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)
and the symmetric [llm_response_parser/middleware.go:6268](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go);
a missing pointer must not be treated as `false` (that would suppress
capture for legacy non-agent-network callers).
`redact_pii` is an orthogonal `bool` controlling **form** of emitted
content, not whether it's emitted.
3. **`redact_pii` is parser-side.** Both parsers import
`llm_guardrail.RedactPII` and run it BEFORE stamping the metadata bag.
Load-bearing because the access-log sink reads `llm.request_prompt_raw`
and `llm.response_completion` directly — by the time `llm_guardrail`
runs its own pass on `llm.request_prompt`, the raw key has already been
stamped. Tests: `TestInvoke_RedactPii_RedactsBeforeEmittingRawPrompt`,
`TestInvoke_RedactPii_RedactsCompletionBeforeEmit`.
4. **Metadata allowlist enforcement.** Every middleware declares
`MetadataKeys()`. The framework accumulator drops any KV outside that
allowlist. When adding a new key, also extend the docstring in
`middleware/keys.go`.
5. **Closed deny-code set.** All deny paths emit one of:
`llm_policy.model_not_routable`, `llm_policy.no_authorised_provider`,
`llm_policy.model_blocked`, `llm_policy.token_cap_exceeded`,
`llm_policy.unmeterable_publisher` (path-routed Vertex publisher with no
parser → 403), `llm_policy.upstream_auth_failed` (GCP token mint failure →
502), or the management-supplied code on `llm_limit_check`. These surface
verbatim; arbitrary middleware text never reaches the wire.
## Things to scrutinise
**Correctness.** `llm_router` model match treats an empty `Models` slice as
"claim every model"
([middleware.go:238248](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
for gateway-style providers — confirm no real provider record ships with an
empty `Models` by accident. Path-prefix tie-break falls back to declaration
order when no candidate prefix-matches, so the synthesiser must emit a
deterministic order. `llm_limit_record` discards `strconv.ParseInt` errors
([middleware.go:7880](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go))
— relies on `llm_response_parser` always emitting parseable values; spot-check
the streaming partial path on truncated bodies.
**Security.** Auth headers must NEVER appear on `Mutations.HeadersAdd/Remove`
for the router — a direct headers path would bypass the framework gate. The
capture-pointer handling is the kind of place a bug ships PII to logs
silently; every synthesiser config path must set the pointer explicitly.
`llm_identity_inject` body inject silently skips on a
non-object `metadata` field
([middleware.go:262270](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
— header path still attributes, but body-level tag-budget enforcement
doesn't run for that request.
**Concurrency.** `cost_meter` shares a `pricing.Loader` via
`atomic.Pointer[Table]`; readers always see a consistent table. Every
middleware is a stateless value receiver. Integration test uses real bufconn
gRPC — race detector is the meaningful bar.
**Perf.** Hot path is `lookupKV` linear scan over <10 KVs; `cost_meter.Cost`
is O(1); SSE accumulation is single-pass. No map allocation per call.
**Observability.** Every deny stamps `llm_policy.decision=deny` and a
matching `llm_policy.reason` — access-log can pivot on either.
`llm_limit_record` only logs at `Debugf` on RPC failure
([middleware.go:125130](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go));
operators need an alternate signal (metric on `RecordLLMUsage` failures) for
counter accuracy.
## Test coverage
| File | Tests | Notes |
|---|---:|---|
| `all_test.go` | 1 | Registry surface lock |
| `agentnetwork_chain_integration_test.go` | 3 | Allow/deny/cap-exhaust vs live sqlite + bufconn gRPC |
| `llm_request_parser/middleware_test.go` | 18 | `provider_id` bypass, redaction, capture-pointer, rune-safe truncation |
| `llm_router/middleware_test.go` | 19 | Three-pass match, deny codes, path-prefix tie-break, header strip+inject |
| `llm_limit_check/middleware_test.go` | 6 | Allow/deny, fail-open on nil mgmt / RPC error, attribution stamping |
| `llm_identity_inject/middleware_test.go` | 28 | HeaderPair, JSONMetadata, ExtraHeaders, body inject, anti-spoof |
| `llm_guardrail/middleware_test.go` | 15 | Allowlist case-insensitivity, prompt capture toggle, deny shape |
| `llm_guardrail/redact_test.go` | 15 | Email, SSN, phone (E.164 + NA), bearer, IPv4; fixture-driven |
| `llm_response_parser/middleware_test.go` | 18 | Buffered OAI+Anthro, capture-pointer, redact, truncation |
| `llm_response_parser/streaming_test.go` | 7 | OAI usage frame, Anthro message_delta, truncated body best-effort |
| `cost_meter/middleware_test.go` | 17 | Each skip reason, provider-shape, pricing loader integration |
| `llm_limit_record/middleware_test.go` | 7 | Skip-on-no-signal, skip-on-missing-attribution, RPC failure swallowed |
## Cross-references
- Sibling: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — SDK adapters
+ SSE framer + pricing loader.
- Path-routed providers (Vertex AI + Bedrock), `keyfile::` credential, GCP
token minting, `/bedrock` prefix:
[50-path-routed-providers.md](./50-path-routed-providers.md).
- Upstream config: `management/server/agentnetwork/synthesizer` (out of scope).
- Framework: `proxy/internal/middleware/{chain,dispatcher,accumulator,registry}.go`.
- Metadata key registry: `proxy/internal/middleware/keys.go`.
- gRPC surface: `proto.ProxyServiceClient.{CheckLLMPolicyLimits,RecordLLMUsage}`.

View File

@@ -1,392 +0,0 @@
# proxy/llm-parsers — SDK adapters + pricing + SSE
The runtime-agnostic LLM library: the OpenAI Responses API (`/v1/responses`)
and the older Chat Completions API (`/v1/chat/completions`), the Anthropic
Messages API (`/v1/messages`), the SSE wire format (`event:` / `data:` lines,
`\n\n` framing, CRLF tolerance), and per-provider token accounting (OpenAI's
cached-prompt **subset** vs Anthropic's cache_read **additive** model). The
pricing table's per-provider cost formula is the highest-leverage place a
small bug would silently mis-bill operators.
Sibling module: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
— the 8 middlewares that consume this package's parsers + pricing loader.
---
## Module boundary
`proxy/internal/llm` is the runtime-agnostic LLM library shared by every
middleware that needs to understand provider-specific shapes. Zero
proxy-framework dependencies:
- `parser.go``Parser` interface, `Provider` enum, public factories
(`Parsers`, `DetectParser`, `ParserByName`).
- `openai.go` / `anthropic.go` / `bedrock.go` — per-provider `Parser` impls.
- `sse.go` — SSE scanner (`Scanner`, `Event`, `NewScanner`).
- `errors.go` — sentinels callers branch on with `errors.Is`.
- `pricing/` — embedded-default + hot-reload override table with
symlink-safe Unix loader (build-tagged stub elsewhere).
- `fixtures/` — captured request/response/stream bodies the tests replay.
The package carries zero proxy-framework dependencies so the same parsers can
be reused later by a WASM adapter
([parser.go:16](../../../proxy/internal/llm/parser.go)).
## Files
| File | LOC | Notes |
|---|---:|---|
| `parser.go` | 104 | Interface + factories + `Provider{Unknown,OpenAI,Anthropic}` enum |
| `openai.go` | 347 | Chat Completions + Completions + Responses API; cached_tokens subset |
| `openai_test.go` | 222 | 11 tests; fixture replay + cached/Responses-API matrix |
| `anthropic.go` | 172 | Messages + legacy `/v1/complete`; cache_read + cache_creation additive |
| `anthropic_test.go` | 154 | 7 tests including streaming-extraction-skipped contract |
| `bedrock.go` | 190 | AWS Bedrock InvokeModel (snake_case) + Converse (camelCase) response shapes; model lives in URL path |
| `bedrock_test.go` | — | InvokeModel + Converse usage shapes; AWS event-stream content-type → `ErrStreamingUnsupported` on buffered `ParseResponse` |
| `sse.go` | 117 | `bufio`-backed scanner; CRLF normalised; trailing-event handling |
| `sse_test.go` | 175 | 12 tests; fixture replay + multiline + size limits |
| `parser_test.go` | 53 | `Parsers()`, `DetectParser`, provider enum values |
| `errors.go` | 31 | 6 sentinels: `Err{Unknown,Unsupported}Provider/Model`, `Err{NotLLM,Malformed}Response`, `ErrStreamingUnsupported`, `ErrMalformedRequest` |
| `pricing/pricing.go` | 421 | `Loader`, `Table`, `Entry`; embedded defaults + atomic swap + mtime reload |
| `pricing/pricing_unix.go` | 69 | `O_NOFOLLOW` + fstat-from-FD + 1 MiB cap |
| `pricing/pricing_other.go` | 21 | Stub returning "not supported on this platform" |
| `pricing/pricing_test.go` | 432 | 21 tests — symlink rejection, reload race, path traversal, oversize |
| `pricing/defaults_pricing.yaml` | 85 | go:embed source of truth |
| `fixtures/*` | 2159 | OAI chat/responses/stream + Anthro messages/stream + pricing starter |
## Request body → parser dispatch
```mermaid
flowchart TD
A[HTTP request<br/>URL + JSON body] --> B{ParserByName?<br/>provider_id config set}
B -- yes --> P[matched Parser]
B -- no --> C[DetectParser]
C --> D{loop Parsers<br/>OpenAIParser, AnthropicParser}
D -- DetectFromURL match --> P
D -- no match --> X[ok=false<br/>middleware skips]
P --> E[ParseRequest body]
E -->|err: ErrMalformedRequest| Y[middleware emits provider only]
E --> F[RequestFacts<br/>model + stream]
P --> G[ExtractPrompt body]
G --> H[joinMessages<br/>extractContentParts<br/>decodeStringOrJoin]
H --> I[prompt text<br/>or empty]
F --> J[stamps llm.model + llm.stream]
I --> K[stamps llm.request_prompt_raw<br/>subject to capture_prompt gate]
```
OpenAI's URL hints
([openai.go:2733](../../../proxy/internal/llm/openai.go)) include
both `/v1/chat/completions` and the bare `/chat/completions` — the latter
covers Cloudflare AI Gateway, which rewrites the canonical version segment.
Anthropic's hints are `/v1/messages` and `/v1/complete`
([anthropic.go:1417](../../../proxy/internal/llm/anthropic.go)).
Both implementations use case-insensitive substring matching so a proxy prefix
strip / rewrite doesn't defeat detection.
`ParserByName` ([parser.go:93103](../../../proxy/internal/llm/parser.go))
is the **agent-network bypass**: the synthesiser knows which parser to use
because it built the synth service from the catalog, so it stamps
`provider_id` on the parser config and the middleware skips URL sniffing
entirely. This is what makes the same parser set work whether the request
flows to OpenAI direct, to LiteLLM, to Portkey, or to any gateway with a
non-canonical URL shape.
**Path-routed providers (Vertex AI, Bedrock) bypass both `ParserByName` and
`DetectParser`.** The model and the parser surface live in the URL path, so the
request middleware extracts them directly (`parseVertexPath` /
`parseBedrockPath`) before the parser-selection step. For Vertex the publisher
segment picks the parser (`anthropic` → Anthropic parser; `google`/Gemini →
none, request denied as unmeterable). For Bedrock the dedicated `BedrockParser`
handles the response. Full treatment in
[50-path-routed-providers.md](./50-path-routed-providers.md).
## Streaming response → SSE chunker → response parser → completion + token count
```mermaid
sequenceDiagram
participant U as upstream LLM
participant LR as llm_response_parser<br/>(OnResponse)
participant S as llm.NewScanner<br/>(SSE framer)
participant P as Parser-specific accumulator<br/>(accumulateOpenAIStream<br/>or accumulateAnthropicStream)
U-->>LR: text/event-stream<br/>(buffered prefix in RespBody)
LR->>S: NewScanner(bytes.NewReader(body))
loop until EOF or [DONE]
S-->>LR: Event{Type, Data}
LR->>P: dispatch per event.Type<br/>(OpenAI: data-only<br/>Anthropic: named events)
P-->>P: accumulate completion text<br/>track usage from final frame
end
P-->>LR: llm.Usage + completion string
LR->>LR: appendUsage stamps<br/>llm.{input,output,total,cached_input,cache_creation}_tokens
LR->>LR: truncateCompletion(3500 bytes, rune-safe)
LR->>LR: redactPII if redact_pii && captureCompletion
```
`Scanner.Next`
([sse.go:4487](../../../proxy/internal/llm/sse.go)) returns one
event per `\n\n` boundary; multiple `data:` lines join with `\n`; comment lines
(starting with `:`) are skipped per the SSE spec; a trailing event without a
closing blank line is still returned before `io.EOF` so a server that closes
the connection cleanly doesn't lose the last frame
([sse.go:5558](../../../proxy/internal/llm/sse.go)). CRLF is
normalised in `trimEOL` so fixtures captured from live servers replay
unchanged.
## Per-provider
### OpenAI
[openai.go:5467](../../../proxy/internal/llm/openai.go) defines
`openAIRequest` with three prompt fields: `messages` (Chat Completions),
`prompt` (legacy), `input` (Responses API). The decoder uses
`json.RawMessage` so each shape is parsed lazily.
`ParseResponse`
([openai.go:117146](../../../proxy/internal/llm/openai.go))
accepts both naming conventions: Chat Completions returns
`prompt_tokens`/`completion_tokens`, Responses API returns
`input_tokens`/`output_tokens`. `pickInt64` prefers Responses-API names and
falls back — same parser handles both endpoints without per-route config.
`openAICachedTokens` mirrors the fallback for
`input_tokens_details.cached_tokens` vs `prompt_tokens_details.cached_tokens`.
**Key invariant:** `CachedInputTokens` for OpenAI is a SUBSET of
`InputTokens`. The cost meter clamps to guard against malformed upstream
responses where `cached > total`.
### Anthropic
[anthropic.go:3749](../../../proxy/internal/llm/anthropic.go)
defines `anthropicRequest` covering Messages API (`system` + `messages[]`)
and legacy `/v1/complete` (`prompt` string). `ExtractPrompt` emits
`system: <text>` first when present, then per-message `role: content`.
`ParseResponse`
([anthropic.go:82104](../../../proxy/internal/llm/anthropic.go))
fills three independent token buckets: `InputTokens`, `CacheReadInputTokens`,
`CacheCreationInputTokens`. Latter two are **additive** (not subset).
`TotalTokens` sums all four so downstream dashboards render one "tokens"
number without double-counting.
`ExtractCompletion` walks `content[]` `{type, text}` parts and concatenates
non-empty text with newlines, falling back to legacy `completion`.
### Bedrock
[bedrock.go](../../../proxy/internal/llm/bedrock.go) implements the
`Parser` interface for the AWS Bedrock runtime. Bedrock is **path-routed**: the
model lives in the URL (`/model/{id}/{action}`), so the request middleware
extracts it (see [50-path-routed-providers.md](./50-path-routed-providers.md))
and `ParseRequest` is a deliberate no-op. The parser's real work is on the
response leg, covering both Bedrock body shapes:
- **InvokeModel** — vendor-native. Anthropic-on-Bedrock returns snake_case usage
(`input_tokens`, `output_tokens`, `cache_read_input_tokens`,
`cache_creation_input_tokens`) with the same additive cache buckets as
first-party Anthropic.
- **Converse** — unified camelCase (`inputTokens`, `outputTokens`,
`totalTokens`). `firstNonZero` folds the two naming conventions into one
`Usage`; when Converse omits `totalTokens` the parser sums the buckets.
`ProviderName()` returns `"bedrock"` — its own `defaults_pricing.yaml` block,
keyed by the **normalised** model id (region prefix + version suffix stripped by
the request parser). `ParseResponse` returns `ErrStreamingUnsupported` for an
AWS binary event-stream content-type (`application/vnd.amazon.eventstream`,
`isAWSEventStream`) so the caller routes to the streaming accumulator instead.
### SSE framing
`Scanner` is `bufio`-backed, 64 KiB read buffer, 1 MiB max line so a
malicious upstream can't blow process memory
([sse.go:3338, 97100](../../../proxy/internal/llm/sse.go)).
`splitField` strips one space after the `:` per the SSE spec. Documented
`not safe for concurrent use`; every consumer creates a fresh scanner per
response body. Streaming accumulators live in the middleware package
([llm_response_parser/streaming.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
but use `llm.NewScanner` so the framing contract stays here.
### Pricing catalog
`Table.Cost`
([pricing.go:129174](../../../proxy/internal/llm/pricing/pricing.go))
is the cost formula — most security-relevant math in this module:
| Provider | Formula |
|---|---|
| `openai` | `(inTokens clamped) × InputPer1K + clamped × CachedInputPer1K + outTokens × OutputPer1K` where `clamped = min(cachedInput, inTokens)` |
| `anthropic`, `bedrock` | `inTokens × InputPer1K + cachedInput × CacheReadPer1K + cacheCreation × CacheCreationPer1K + outTokens × OutputPer1K` |
| default | `inTokens × InputPer1K + outTokens × OutputPer1K` |
`bedrock` shares the Anthropic additive-cache formula
([pricing.go:172-174](../../../proxy/internal/llm/pricing/pricing.go)):
Anthropic-on-Bedrock reports the same additive cache buckets, while non-Anthropic
Bedrock models (Nova, Llama) simply report zero in those buckets so cost reduces
to `input + output`.
Each per-bucket rate falls back to `InputPer1K` when zero — operators opt in
to discounts by setting the field.
`Loader`
([pricing.go:212268](../../../proxy/internal/llm/pricing/pricing.go))
overlays an optional `pricing.yaml` from data-dir on top of the go:embed
defaults. Atomic pointer swap means readers never observe a partial update.
The mtime-poll reloader (30s default cadence) keeps the previous table on
parse failure so cost annotation never goes blank during a botched edit.
`defaults_pricing.yaml` is the source of truth for built-in pricing.
Operator overrides only carry the entries they want to change.
## Public contracts
**`Parser` interface**
([parser.go:5066](../../../proxy/internal/llm/parser.go)):
```go
type Parser interface {
Provider() Provider
ProviderName() string
DetectFromURL(path string) bool
ParseRequest(body []byte) (RequestFacts, error)
ParseResponse(status int, contentType string, body []byte) (Usage, error)
ExtractPrompt(body []byte) string
ExtractCompletion(status int, contentType string, body []byte) string
}
```
Adding a provider means implementing this interface and appending to the
slice returned by `Parsers()` ([parser.go:7884](../../../proxy/internal/llm/parser.go)).
Order matters: `DetectFromURL` ties resolve by registration order.
`Parsers()` today returns `{OpenAIParser, AnthropicParser, BedrockParser}`.
**`Provider` enum**
([parser.go:818](../../../proxy/internal/llm/parser.go)):
`ProviderUnknown = 0`, `ProviderOpenAI = 1`, `ProviderAnthropic = 2`,
`ProviderBedrock = 3`. Numeric values are persisted in nothing today but treat
them as wire-stable — new providers must take fresh numbers.
**`Pricing` lookup**
([pricing.go:129](../../../proxy/internal/llm/pricing/pricing.go)):
```go
func (t *Table) Cost(provider, model string, inTokens, outTokens, cachedInput, cacheCreation int64) (float64, bool)
```
Nil-safe: `t.Cost` on a nil receiver returns `(0, false)`
([pricing.go:130132](../../../proxy/internal/llm/pricing/pricing.go)).
`ok=false` means provider or model is absent from the loaded table; the caller
emits `cost.skipped=unknown_model`.
## Invariants
1. **Cross-platform pricing build.** `pricing_unix.go` carries the only
functional `loadPricing` (uses `syscall.O_NOFOLLOW` and `f.Stat()` on an
open descriptor — both Unix-only). `pricing_other.go` is a build-tag
fallback that returns `"not supported on this platform"`
([pricing_other.go:1416](../../../proxy/internal/llm/pricing/pricing_other.go)).
The proxy is Linux-only in production today; a Windows port needs an
equivalent path-as-handle implementation. Reviewers building on Windows
should expect this surface to return an error at startup if an override
file is configured.
2. **SSE scanner handles partial chunks.** A buffered prefix that doesn't end
in `\n\n` still yields its accumulated event before `io.EOF`
([sse.go:5558](../../../proxy/internal/llm/sse.go)). Tests:
`TestSSEScanner_OpenAIFixture`, `TestSSEScanner_AnthropicFixture`,
`TestSSEScanner_MultilineData`, `TestSSEScanner_CRLF`. The streaming
accumulators ride on this: `accumulateAnthropicStream` and
`accumulateOpenAIStream` `break` on any scanner error to return partial
usage rather than aborting
([streaming.go:6873, 144150](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go)).
3. **`defaults_pricing.yaml` is the source of truth.** Compiled into the
binary via `//go:embed`
([pricing.go:2930](../../../proxy/internal/llm/pricing/pricing.go)).
`DefaultTable()` parses once and panics on parse failure
([pricing.go:4249](../../../proxy/internal/llm/pricing/pricing.go))
— by design: a broken embedded YAML must not ship to production.
4. **Loader path validation.** `resolveMiddlewareDataPath`
([pricing.go:370394](../../../proxy/internal/llm/pricing/pricing.go))
rejects absolute paths, traversal segments, and basenames that fail
`basenameRegex = ^[a-zA-Z0-9._-]+$`. The resolved path must remain
inside `baseDir` even after `filepath.Clean`. Tests:
`TestNewLoader_PathValidation`, `TestNewLoader_PathValidation_Extended`,
`TestNewLoader_SymlinkOutsideBaseDirRejected`, `TestNewLoader_SymlinkRejected`.
5. **Unix loader symlink safety.** `O_NOFOLLOW` on open, `f.Stat()` on the
open descriptor (never re-stat by path), `info.Mode().IsRegular()` check,
`io.LimitReader(f, maxPricingBytes+1)` with a final size assertion
([pricing_unix.go:2557](../../../proxy/internal/llm/pricing/pricing_unix.go)).
A mid-read symlink swap is detected because the fstat is on the original
fd. Test: `TestNewLoader_RejectsOversizedFile_FixesM4`.
6. **`yaml.NewDecoder(...).KnownFields(true)`**
([pricing.go:397398](../../../proxy/internal/llm/pricing/pricing.go))
rejects YAML files that carry fields not in the schema. A typo in an
operator override file fails loud instead of silently zeroing rates.
## Things to scrutinise
**Correctness.** Verify OpenAI cached-prompt clamp at
[pricing.go:147149](../../../proxy/internal/llm/pricing/pricing.go)
short-circuits before subtraction. `Anthropic.TotalTokens` sums all four
buckets (in + out + cache_read + cache_creation) — downstream dashboards
need to know this differs from `input + output`.
`OpenAIParser.ExtractPrompt` falls through `messages → input → prompt`; a
request sending all three reports only `messages` (uncommon but worth
noting).
**Security.** `Scanner.maxLine = 1 MiB`; a 2 MiB single-line `data:` event
errors from `Scanner.Next` and both accumulators stop with partial usage.
Pricing file 1 MiB cap is orders of magnitude larger than realistic. Confirm
new schema additions are mirrored in both `pricingFile` and `Entry`;
`KnownFields(true)` will reject silently-typo'd operator overrides
otherwise.
**Concurrency.** `Loader.table` is `atomic.Pointer[Table]`; readers never
block or see a torn table. `Loader.Reload` is one goroutine, cancelled via
context (`TestLoader_ReloadBackgroundLoopCancellation`). `DefaultTable()`
uses `sync.Once`. Per-call `Scanner` instances mean no shared state across
concurrent response-parser calls.
**Perf.** `Table.Cost` is two map lookups + multiplications, O(1).
`Scanner.Next` is one `ReadString('\n')` per line. Pricing reload poll 30s.
**Observability.** Reload failures count via `metric.Int64Counter` keyed
`plugin`; warning log rate-limited at 5 min so a broken file doesn't flood.
Parser errors return sentinels — middleware uses `errors.Is` to map to the
right `cost.skipped` reason.
## Test coverage
| File | Tests | Coverage highlights |
|---|---:|---|
| `parser_test.go` | 3 | `Parsers()` shape lock, `DetectParser` URL matrix, provider enum stability |
| `openai_test.go` | 11 | Chat Completions + Responses API + legacy `prompt`; cached-tokens subset for both naming conventions; fixture replays |
| `anthropic_test.go` | 7 | Messages + legacy `/v1/complete`; streaming REJECTED on `ParseResponse` (must use scanner); fixture replays |
| `sse_test.go` | 12 | Fixture replay both providers; multiline `data:`; CRLF; comment skip; trailing-event-without-blank-line; oversize rejection |
| `pricing/pricing_test.go` | 21 | Provider-shape switch; cached-rate fallback; cached-clamp; symlink rejection (target outside basedir + symlink to file); path validation matrix; oversize rejection; reload-keeps-previous-on-parse-error; mtime change detection; goroutine cancellation |
**Fixtures** ([proxy/internal/llm/fixtures/](../../../proxy/internal/llm/fixtures/)):
`openai_chat_completion.json` (chat.completions with usage),
`openai_responses.json` (Responses API shape),
`openai_stream.txt` (3 deltas + usage + `[DONE]`),
`anthropic_messages.json` (Messages API non-streaming),
`anthropic_stream.txt` (full 7-event sequence: message_start →
content_block_{start,delta×2,stop} → message_delta (usage) → message_stop),
`pricing.yaml` (realistic-pricing starter for operator overrides).
## Cross-references
- Sibling: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
— the chain that calls `llm.Parsers()`, `llm.ParserByName`,
`llm.NewScanner`, `pricing.NewLoader`.
- Path-routed providers (Vertex AI + Bedrock), credential syntax, and the
Bedrock AWS event-stream accumulator:
[50-path-routed-providers.md](./50-path-routed-providers.md).
- Direct callers: `llm_request_parser/middleware.go:8294`,
`llm_response_parser/middleware.go:113123`,
`llm_response_parser/streaming.go:65, 142`, `cost_meter/factory.go:4957`.
- Related elsewhere: the agent-network synthesiser stamping `provider_id`
is covered in the management-side module guide; proxy server boot +
`FactoryContext` construction is covered in the proxy-framework guide.

View File

@@ -1,194 +0,0 @@
# proxy/runtime — translate + serve + log
> **Risk level:** High — every config push from management is translated here, and the chain runs on every HTTP request to a synth target.
> **Backward-compat impact:** Additive at the wire (`PathTargetOptions.middlewares`, `agent_network`, `disable_access_log`, capture caps) and on the proxy `Server` struct (`MiddlewareDataDir`, `MiddlewareCaptureBudgetBytes`). Non-agent-network targets stay on the no-middleware fast path.
## Module boundary
Turns the synth-service wire format from `ProxyService.SyncMappings`/`GetMappingUpdate` into in-process middleware chains and runs them on top of the existing `httputil.ReverseProxy`. Four concerns: (a) **translate**`proto.MiddlewareConfig` → validated `middleware.Spec` (proxy/middleware_translate.go) + self-register the eight built-ins (proxy/middleware_register.go); (b) **boot + rebuild** — construct the `middleware.Manager`, share the OTel meter, install the live-service check, rebuild per-path chains on every `addMapping`/`modifyMapping` (proxy/server.go); (c) **serve** — resolve chain at request time, capture bodies under a global budget, invoke `RunRequest`/`RunResponse`/`RunTerminal`, render deny responses, apply `UpstreamRewrite` (proxy/internal/proxy/reverseproxy.go); (d) **log + tag** — emit access-log entries with the new `agent_network` flag, gate emission on `EnableLogCollection` via `DisableAccessLog` (proxy/internal/accesslog).
**Inert for non-agent-network targets**: nil or empty chain → existing fast path (reverseproxy.go:127-139); `SuppressAccessLog` defaults false so the access-log middleware emits unchanged.
## Files
| Path | Role |
| ---- | ---- |
| proxy/middleware_translate.go | proto→Spec translation; slot/failmode/timeout mapping; caps |
| proxy/middleware_translate_test.go | translator unit tests |
| proxy/middleware_register.go | blank-imports the eight builtins for `init()` registration |
| proxy/server.go | `initMiddlewareManager`, `rebuildMiddlewareChains`, `isLiveService`, `buildMiddlewareBindings`, new Server fields, `protoToMapping` stamps AgentNetwork/DisableAccessLog/CaptureConfig/Middlewares |
| proxy/internal/proxy/reverseproxy.go | `WithMiddlewareManager`, chain dispatch, body capture, `applyUpstreamRewrite`/`Headers`, `buildRequestInput`, response-leg respInput identity fields |
| proxy/internal/proxy/reverseproxy_test.go | `TestBuildRequestInput_PropagatesIdentityAndGroups` |
| proxy/internal/proxy/context.go | `agentNetwork`, `suppressAccessLog`, `userGroupNames` on `CapturedData` |
| proxy/internal/proxy/servicemapping.go | new `PathTarget` fields |
| proxy/internal/proxy/agent_network_chain_realstack_test.go | end-to-end self-contained chain test |
| proxy/internal/accesslog/logger.go | `logEntry.AgentNetwork``proto.AccessLog` |
| proxy/internal/accesslog/middleware.go | reads `GetAgentNetwork()`; gates `l.log` on `!GetSuppressAccessLog()` |
| proxy/internal/accesslog/middleware_test.go | suppress/default/preserves-usage assertions |
| proxy/internal/auth/middleware_test.go | tunnel-peer group propagation contract |
| proxy/internal/metrics/metrics.go | `Meter()` getter for the middleware manager |
## Architecture & flow
### Synth-service ingestion → translate → register → serve
```mermaid
flowchart TD
A[Management SyncMappings/GetMappingUpdate] --> B["processMappings\nserver.go:1492"]
B --> C{Mapping type}
C -->|CREATED| D["addMapping → setupHTTPMapping → updateMapping"]
C -->|MODIFIED| E["modifyMapping → cleanupMappingRoutes → setupHTTPMapping → updateMapping"]
C -->|REMOVED| F["removeMapping → cleanupMappingRoutes → invalidateMiddlewareChains"]
D --> G["protoToMapping\nserver.go:2181"]
E --> G
G --> H["translateMiddlewareConfigs\nmiddleware_translate.go:55"]
G --> I["translateMiddlewareCaptureConfig\nmiddleware_translate.go:18"]
H --> J["[]middleware.Spec on PathTarget"]
I --> K["*bodytap.Config on PathTarget"]
J --> L["proxy.AddMapping\nservicemapping.go:118"]
K --> L
L --> M["rebuildMiddlewareChains\nserver.go:2017 → Manager.Rebuild"]
F --> N["Manager.Invalidate(serviceID)"]
```
### Per-request lifecycle through the chain + accesslog
```mermaid
sequenceDiagram
autonumber
participant C as Client
participant M as accesslog.Middleware
participant A as auth.Middleware (Protect)
participant RP as ReverseProxy.ServeHTTP
participant CH as middleware.Chain
participant U as Upstream
C->>M: HTTP request
M->>M: NewCapturedData(requestID), WithCapturedData(ctx)
M->>A: next.ServeHTTP
A->>A: Private → ValidateTunnelPeer → stamp UserID/Email/Groups/GroupNames/AuthMethod
A->>RP: next.ServeHTTP
RP->>RP: findTargetForRequest → targetResult
RP->>RP: stamp ServiceID/AccountID/AgentNetwork/SuppressAccessLog on CapturedData
RP->>RP: resolveChain via Manager.ChainFor
alt chain == nil or Empty
RP->>U: httputil.ReverseProxy.ServeHTTP (fast path)
else chain non-empty
RP->>RP: bodytap.CaptureRequest (global budget)
RP->>CH: RunRequest
CH-->>RP: denyOutput? requestMeta + upstreamRewrite
alt deny
RP->>C: RenderDenyResponse
else allow
RP->>RP: capturingWriter + applyUpstreamRewrite/Headers
RP->>U: httputil.ReverseProxy.ServeHTTP(respWriter)
U-->>RP: response
RP->>CH: RunResponse (respInput carries UserGroups)
RP->>CH: RunTerminal (merged request+response metadata)
end
end
RP-->>M: handler returns
M->>M: build logEntry incl. AgentNetwork
alt SuppressAccessLog == true
M->>M: skip l.log; still trackUsage
else default
M->>M: l.log → goroutine SendAccessLog
end
```
### EnableLogCollection suppression path
```mermaid
flowchart LR
S["agentnetwork.Settings.EnableLogCollection"] --> B["synthesizer: target.DisableAccessLog = !EnableLogCollection"]
B --> P["proto PathTargetOptions.disable_access_log (field 13)"]
P --> T["protoToMapping reads GetDisableAccessLog()\nserver.go:2211"]
T --> M["PathTarget.DisableAccessLog\nservicemapping.go:47"]
M --> R["ServeHTTP: cd.SetSuppressAccessLog\nreverseproxy.go:106"]
R --> G["accesslog middleware: if !GetSuppressAccessLog l.log\nmiddleware.go:95"]
R --> U["trackUsage unconditional — bandwidth telemetry preserved"]
```
**Ingestion** lands as a `ProxyMapping` batch on `handleSyncMappingsStream`/`handleMappingStream`. `processMappings` dispatches to `addMapping`/`modifyMapping`/`removeMapping`; HTTP goes `setupHTTPMapping → updateMapping → protoToMapping`. `protoToMapping` (server.go:2181) is the single translation surface that materialises `[]middleware.Spec`, `*bodytap.Config`, `AgentNetwork`, `DisableAccessLog` onto each `PathTarget`; `updateMapping` finishes with `s.proxy.AddMapping(m)` (atomic swap under `mappingsMux`) and `s.rebuildMiddlewareChains(svcID, m)`.
At **request time** the access-log middleware stamps `CapturedData`; the auth chain runs (Private services lift `peer_group_ids` from `ValidateTunnelPeer` — auth/middleware_test.go:322). `ReverseProxy.ServeHTTP` resolves the chain; nil or empty → original `httputil.ReverseProxy`, no body capture. When a chain matches, body is captured under the global budget, `RunRequest` produces an `UpstreamRewrite` (`llm_router` selects a provider, rewrites scheme/host/path, injects `Authorization`), and `RunResponse`+`RunTerminal` run after the upstream returns. The terminal slot sees the merged metadata bag — that's how `llm_limit_record` ships the consumption sample. The **access-log** addition: `logEntry.AgentNetwork` from `GetAgentNetwork()` onto `proto.AccessLog.AgentNetwork`; the gate at middleware.go:95 honors `EnableLogCollection`, skipping `l.log` but keeping `trackUsage` so bandwidth telemetry survives.
## Public contracts touched
- `proxy.Server.MiddlewareDataDir` (string) — base dir for file-backed middleware config (server.go:238-241).
- `proxy.Server.MiddlewareCaptureBudgetBytes` (int64) — process-wide capture cap; defaults to 256 MiB (server.go:248-250).
- `proxy/internal/proxy.WithMiddlewareManager(*middleware.Manager) Option` — new option on `NewReverseProxy`; nil keeps the fast path (reverseproxy.go:48-56).
- `proxy/internal/proxy.PathTarget` adds `Middlewares`, `CaptureConfig`, `AgentNetwork`, `DisableAccessLog` (servicemapping.go:27-51), all zero-default.
- `proxy/internal/proxy.CapturedData` adds `agentNetwork`, `suppressAccessLog`, `userGroupNames` behind `sync.RWMutex`; slices deep-copied (context.go:47-66, 183-258).
- `accesslog.logEntry.AgentNetwork` + `proto.AccessLog.AgentNetwork` (logger.go:131, 268).
- `metrics.Metrics.Meter()` exposes the OTel meter for the middleware manager (metrics.go:53-58).
## Invariants
- **Synth-service updates are live (no proxy restart).** Every `MODIFIED` flows through `modifyMapping → cleanupMappingRoutes` (invalidates chains) `→ setupHTTPMapping → updateMapping → rebuildMiddlewareChains`. **ProxyMapping.Private preservation:** the relevant logic lives in `management/internals/shared/grpc/proxy.go:shallowCloneMapping`, not this module, but it surfaces here — if a `MODIFIED` synth service arrives `private=false`, auth skips `ValidateTunnelPeer`, `CapturedData.UserGroups` stays empty, and `llm_router` denies with `llm_policy.no_authorised_provider` until a management restart re-pushes the snapshot. This module assumes `mapping.GetPrivate()` is correct on every batch.
- **`EnableLogCollection=false` suppresses access-log writes but middleware still runs.** Gate is one `if !cd.GetSuppressAccessLog()` immediately around `l.log(entry)` (middleware.go:95); `trackUsage` runs below the gate. Locked by `TestMiddleware_SuppressAccessLog_PreservesUsageTracking` (middleware_test.go:139).
- **`agent_network` flag on access-log entries is set when the chain processed the request.** Source `target.AgentNetwork`, stamped at reverseproxy.go:105, read at accesslog/middleware.go:86.
- **auth → builtin group propagation.** `Protect` writes `UserGroups`/`UserGroupNames`; `buildRequestInput` (reverseproxy.go:333) copies them into `middleware.Input`. The response-leg `respInput` (reverseproxy.go:196-223) also carries `UserEmail`/`UserGroups`/`UserGroupNames``llm_limit_record` needs `UserGroups` to ship `group_ids` so management's group-targeted budget rules match (comment at reverseproxy.go:211-215).
- **Empty chains stay on the fast path.** `ServeHTTP` skips body capture and the run sequence when `chain == nil || chain.Empty()` (reverseproxy.go:127).
- **Self-registration is the only way a builtin reaches the registry.** `middleware_register.go` blank-imports each builtin; `init()` adds the factory to `mwbuiltin.DefaultRegistry()`. Missing it → translator drops the entry with a warn (translate.go:97).
## Things to scrutinize
### Correctness
- **Translate edge cases** — drops on nil cfg, empty ID, unknown ID, UNSPECIFIED slot; each logs one warn; volume bounded by `MaxMiddlewaresPerChain`.
- **Re-translate without dropping in-flight requests** — `Manager.Rebuild` is the only call from `rebuildMiddlewareChains`. Reverse proxy reads `ChainFor` once per request (reverseproxy.go:327) and runs the captured `*Chain` for the whole request. Verify in module 30 that `Rebuild` swaps atomically.
- **ProxyMapping.Private preservation** — enforced management-side in `shallowCloneMapping`. Proxy-side regression catches: `TestProtect_PrivateService_TunnelPeerGroupsPropagate` + the integration test.
- **Body-capture cleanup** — `defer releaseBudget()` (reverseproxy.go:145) and `defer capturingWriter.Release()` (reverseproxy.go:180) must run on every return; confirm no future `return` lands between acquisition and defer.
- **`applyUpstreamRewrite` clones the URL** — `cloned := *orig` value-copies `*url.URL`; safe because overwritten fields are strings, not slices/maps (reverseproxy.go:285-292).
### Security
- **Translate validates every config** — registry membership rejects unknown IDs; UNSPECIFIED slot drops; ID-less drops; raw config copied (not aliased) at translate.go:109.
- **`AuthHeader`/`StripHeaders` only reachable via `UpstreamRewrite`** — regular mutation surface goes through the framework denylist (`Authorization`/`Cookie` blocked); only the router middleware can replace `Authorization` (reverseproxy.go:296-304). Confirm in module 30 nothing outside the proxy-trusted path populates `UpstreamRewrite.AuthHeader`.
- **`stampNetBirdIdentity` strips client-sent values first** (reverseproxy.go:742-743) — anti-spoof for `X-NetBird-User`/`X-NetBird-Groups`; control chars filtered; comma-bearing labels dropped (reverseproxy_test.go:1217/:1243/:1193).
- **Auth → group propagation** — `auth/middleware_test.go:322` and `:366` cover the contract. If auth ever stops calling `ValidateTunnelPeer` for Private services, every agent-network request silently denies.
### Concurrency
- **Chain replacement under in-flight requests** — `findTargetForRequest` takes `mappingsMux.RLock`; `AddMapping` writes. `resolveChain` calls `ChainFor` once; even if `Rebuild` swaps mid-request, in-flight requests keep running on the captured pointer.
- **`CapturedData` mutation across slots** — accessors take `sync.RWMutex`; slices deep-copied on both Set and Get. Verify no caller mutates the returned slice expecting it to land back.
- **`Manager.Invalidate` race** — `removeMapping` invalidates after `cleanupMappingRoutes`; mapping read happens before chain resolution, so requests before invalidate run captured chains; later ones fail `findTargetForRequest`.
- **`Logger.log` goroutine** — `logSem` caps at `maxLogWorkers = 4096`; overflow → `dropped.Add(1)` + debug log. Middleware test uses a buffered channel and 150ms negative-assertion window — review whether 150ms holds on slow CI.
### Backward compatibility
- **Non-agent-network services unaffected** — `protoToMapping` reads new fields only when `opts != nil`; defaults leave `Middlewares`/`CaptureConfig` nil → chain resolves nil → fast path. Existing `reverseproxy_test.go` (non-chain) still passes.
- **`disable_access_log` is proto field 13, default false** — every existing target unset; gate is no-op. Locked by `TestMiddleware_SuppressAccessLog_DefaultEmitsLog` (middleware_test.go:104).
- **`Server` additions optional** — 256 MiB default when `MiddlewareCaptureBudgetBytes ≤ 0` (server.go:1997-2000).
### Performance
- **Translate cost per push** — O(n) with per-entry registry lookup and `config_json` copy; negligible vs. the upstream gRPC unmarshal.
- **Empty-chain hot path** — one `ChainFor` map lookup + one `chain.Empty()` check; no allocation delta vs. pre-PR.
- **Body capture buffer churn** — `bodytap.CaptureRequest` allocates `MaxRequestBytes` per chain-hitting request; `releaseBudget` ties allocation to the 256 MiB proxy-wide budget. Confirm in module 30 the budget is a hard cap.
### Observability
- **Metrics** — `Metrics.Meter()` shared with `middleware.NewMetrics` (server.go:1990-1993) so middleware instruments land in the same prometheus exporter. No new metrics defined here.
- **Access-log accuracy** — every entry carries `AgentNetwork`; terminal-slot metadata merged into `CapturedData.Metadata` (reverseproxy.go:238-241).
- **Deny logs at `Infof`** (reverseproxy.go:170) — review whether `Info` is too noisy at high deny rates; consider Debug or rate-limit.
## Test coverage
| Test file | Locks down |
| --------- | ---------- |
| proxy/middleware_translate_test.go | Empty/nil → nil; field preservation; unknown ID skip; nil registry permissive; timeout clamping; fail-mode + slot incl. UNSPECIFIED-drop; empty-ID drop; truncation above + at `MaxMiddlewaresPerChain` |
| proxy/internal/proxy/reverseproxy_test.go | Rewrite host/headers/cookies/query; trusted proxy; path forwarding; classifyProxyError; X-NetBird-User/Groups anti-spoof + CSV-join + control-char/comma rejection + fallback-to-ID; `TestBuildRequestInput_PropagatesIdentityAndGroups` (UserGroups/Email/GroupNames/AgentNetwork reach `middleware.Input`) |
| proxy/internal/proxy/agent_network_chain_realstack_test.go | **The end-to-end integration test.** Drives a real agent-network request through `ReverseProxy.ServeHTTP` with the chain the synthesizer produces, against an in-process management gRPC (bufconn) backed by a real sqlite store + real `agentnetwork.Manager`, plus an `httptest` upstream — no external infrastructure or real LLM. Guarantees: (1) response-leg `respInput` carries `UserGroups` so `llm_limit_record` ships non-empty `group_ids` and the admin-group consumption row increments; (2) `RedactPii=true` redacts both prompt and completion on captured metadata; (3) the full chain runs against a real management stack. **Line 189-211 inlines the proto→Spec mapping** instead of calling the proxy's private `translateMiddlewareConfig` — keep that inline mirror in sync with `proxy/middleware_translate.go` or the test silently diverges from production. |
| proxy/internal/accesslog/middleware_test.go | `SuppressAccessLog=true` skips `SendAccessLog` (150ms negative wait); default emits one send (2s positive); usage tracking runs under suppression |
| proxy/internal/auth/middleware_test.go | `TestProtect_PrivateService_TunnelPeerGroupsPropagate` proves `peer_group_ids` reach `CapturedData.UserGroups`; `TestProtect_PrivateService_TunnelPeerDenied` proves rejected peers 403 without reaching the handler |
The integration test runs in a few seconds with no external infrastructure — exercising the real synthesizer, `Manager.Rebuild`, `ServeHTTP` dispatch, and `llm_limit_record` writing a real consumption row through the real `agentnetwork.Manager` over real gRPC.
## Known limitations / explicit non-goals
- **Translator does not validate `RawConfig` JSON** — factory's job at `New([]byte)`. Confirm in module 30 that a per-binding factory failure doesn't poison the rest of the chain.
- **No throttle on management push rate** — every `MODIFIED` triggers `Manager.Rebuild`. Mitigation upstream.
- **Streaming responses (SSE)** — body capture is streaming-aware, but response-leg middleware runs only after the response completes; long SSE streams delay `llm_limit_record` until close.
- **OIDC-only path doesn't carry tunnel-peer groups** — agent-network synth services rely on the Private tunnel-peer path; JWT groups claim is the only carrier for non-Private OIDC.
- **`agent_network` flag on L4 entries** not added; HTTP-only.
- **`mw.capture.bypass_reason` metadata key** documented at reverseproxy.go:151,184; namespace this in module 30/31 to avoid collisions.
## Cross-references
- Upstream: [shared/api](10-shared-api.md), [proxy/middleware-framework](30-proxy-middleware-framework.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), [proxy/llm-parsers](32-proxy-llm-parsers.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)

View File

@@ -1,228 +0,0 @@
# dashboard — UI for agent-networks
This module documents code that lives in the **dashboard repo** (under
`src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`), not
in this repo. It is co-located here so backend readers see the full picture.
> **Risk level:** Medium. The new surface is isolated under `src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`, but it also reshapes the sidebar, splits `/peers`, renames `reverse-proxy/clusters` → `self-hosted-proxies`, and overlays the Control Center graph. Regressions here would be cross-cutting.
> **Backward-compat impact:** Additive on the API side. Breaking on URL/navigation: `/peers` redirects to `/peers/devices` (src/app/(dashboard)/peers/page.tsx:7-15), `/reverse-proxy/clusters` was renamed to `/reverse-proxy/self-hosted-proxies`, the sidebar lost Access Control / Networks / Reverse Proxy / DNS / standalone Guardrails / Consumption / Activity (Navigation.tsx:165-171 — routes still resolve via URL), and the standalone `/agent-network/{access-log,consumption,global-controls}` routes are gone in favor of `/agent-network/observability`.
## Module boundary
The dashboard is the only place an operator interacts with agent-networks: provider catalog, configured providers, policies, guardrails, account-level budget rules, account settings (collection / redaction toggles), per-request access log, and consumption rollups all render, paginate, and edit here. Data flows in via SWR (`useFetchApi`) keyed by REST URL. One big context provider (`src/modules/agent-network/AIProvidersProvider.tsx`) aggregates five resources (providers, policies, guardrails, budget rules, settings) plus the proxy access-log stream filtered to `agent_network=true`, and exposes `add* / update* / toggle* / delete*` mutators that call through `useApiCall` and re-`mutate()` SWR. Pages mount the provider once at the top and compose presentational tables and modals beneath. The control-center page additionally fetches `/agent-network/{providers,policies}` directly (control-center/page.tsx:123-130) to overlay graph nodes.
## What the UI delivers
- **AI Observability** page with four tabs: Access Logs, Budget Dashboard,
Budget Settings, Log Settings (replaces the standalone access-log,
consumption, and global-controls routes).
- **Providers** page: provider catalog + connect/edit wizard with per-vendor
copy (LiteLLM, Portkey, Bifrost, Cloudflare, Vercel, OpenRouter, custom).
- **Policies** page: group → provider authorization with per-policy Limits
(minute-granular windows) + guardrail attach.
- **Guardrails** page: reusable model-allowlist + prompt-capture sets.
- **Account controls**: Log Collection / Prompt Collection / Redact PII toggles.
- **Budget rules**: account-level rules reusing the policy Limits UI.
- **Control Center overlay**: provider + agent-policy nodes on the graph.
- **Navigation + peers reshaping**: peers split into Devices / Agents,
`reverse-proxy/clusters` renamed to `self-hosted-proxies`, sidebar
repackaged for agent-network focus.
## Surface added
### New pages
| Route | Purpose | Backing module(s) |
| ----- | ------- | ----------------- |
| `/agent-network` | Redirect to `/agent-network/providers` | page.tsx:7-15 |
| `/agent-network/providers` | List + connect providers; header surfaces per-account base URL | providers/page.tsx + AgentProvidersTable + AIProviderModal |
| `/agent-network/policies` | Group → Provider authorization with per-policy Limits + Guardrail attach | policies/page.tsx + AgentPoliciesTable + AgentPolicyModal |
| `/agent-network/guardrails` | Reusable guardrail sets (model allowlist + prompt capture) | guardrails/page.tsx + AgentGuardrailsTable + AgentGuardrailModal |
| `/agent-network/observability` | Tabs: Access Logs / Budget Dashboard / Budget Settings / Log Settings | observability/page.tsx |
| `/peers/devices`, `/peers/agents` | Split of `/peers`, shared via `PeersListView` keyed by `kind` | peers/{devices,agents}/page.tsx |
| `/reverse-proxy/self-hosted-proxies` | Renamed from `clusters` | self-hosted-proxies/page.tsx |
Removed in favor of `/agent-network/observability`: `/agent-network/access-log`, `/agent-network/consumption`, `/agent-network/global-controls`.
### New modules under src/modules/agent-network
| File | Role |
| ---- | ---- |
| AIProvidersProvider.tsx (~1158 LOC) | Aggregates every agent-network resource via SWR; normalises snake↔camel; exposes mutators; holds wizard-open state |
| AIProviderModal.tsx (~1268 LOC) | Connect / edit provider wizard with per-vendor copy (Bifrost, Portkey, LiteLLM, Cloudflare, Vercel, OpenRouter, custom) |
| AIProviderLogo + useProviderCatalog | Catalog-driven brand swatch + SWR hook over `/agent-network/catalog/providers` |
| AgentPoliciesTable + AgentPolicyModal + AgentPolicyGuardrailsTab + AgentPolicyLimitsTab | Policies; modal has 3 tabs (Rule, Limits, Guardrails) |
| AgentGuardrailsTable + AgentGuardrailModal + AgentGuardrailBrowseModal + AgentGuardrailChecksCell | Guardrails CRUD + attach-from-policy |
| AgentBudgetRulesTable + AgentBudgetRuleModal | Account-level budget rules; modal reuses AgentPolicyLimitsTab verbatim |
| AgentAccountControlsCard | Three account-wide toggles (Log Collection / Prompt Collection / Redact PII) |
| AgentAccessLogTable + AgentAccessLogExpandedRow | Access log on `/events/proxy?agent_network=true` |
| AgentConsumptionPanel + AgentConsumptionTable | Token + cost panel: charts + counter table |
| table/AgentProvidersTable + AgentProviderActionCell | Providers table + per-row actions |
| data/mockData.ts | Domain types and a few residual `MOCK_*` constants (see scrutinize) |
### Touched non-agent-network areas
- **control-center**: agent-network overlay (provider + agent-policy nodes); removed the All Networks dropdown; hid the Networks tab in FlowSelector (FlowSelector.tsx:9-14 — enum value kept so `?tab=networks` still type-checks); wrapped `ControlCenterView` in `AIProvidersProvider` (page.tsx:73-83); `agentPolicyNode` clicks routed to a separate state slot (page.tsx:1871-1874). New node renderers: nodes/ProviderNode.tsx, nodes/AgentPolicyNode.tsx (registered at utils/nodes.ts:21-22).
- **peers**: Split into Devices and Agents sub-routes; shared via `PeersListView` keyed by `kind` (PeersListView.tsx:24-95). New compact-toolbar `UserFilterSelector` (users/UserFilterSelector.tsx).
- **reverse-proxy**: Folder rename `clusters/``self-hosted-proxies/`; deleted `ClustersFeaturesCell.tsx`, `ClusterTypeIndicator.tsx`; new ReverseProxyClusterTargetSelector for cluster target type; Private toggle on target modal; body-capture knobs removed; new ReverseProxyEventExpandedRow.
- **events**: `ReverseProxyEventsUserCell` rewritten with user + peer fallback (ReverseProxyEventsUserCell.tsx:14-21), shared with the access-log table.
- **navigation**: Full repackaging in Navigation.tsx — Agent Network items flattened (no collapsible parent), distinct icons per item; Access Control, Networks, Reverse Proxy, DNS, standalone Guardrails, Consumption, Activity removed (still URL-reachable, per lines 165-171).
## Architecture & flow
### Page → Provider → Table/Modal hierarchy
```mermaid
graph TD
Nav[Navigation.tsx]
Nav --> ProvidersPage[/agent-network/providers/]
Nav --> PoliciesPage[/agent-network/policies/]
Nav --> GuardrailsPage[/agent-network/guardrails/]
Nav --> ObsPage[/agent-network/observability/]
ProvidersPage --> AIPP1[AIProvidersProvider]
PoliciesPage --> AIPP2[AIProvidersProvider]
GuardrailsPage --> AIPP3[AIProvidersProvider]
ObsPage --> AIPP4[AIProvidersProvider]
ObsPage -.wraps.-> GroupsProvider
ObsPage -.wraps.-> PeersProvider
AIPP1 --> ProvTable[AgentProvidersTable]
ProvTable --> ProvModal[AIProviderModal]
AIPP2 --> PolTable[AgentPoliciesTable]
PolTable --> PolModal[AgentPolicyModal]
PolModal --> PolGuardTab[AgentPolicyGuardrailsTab]
PolModal --> PolLimitsTab[AgentPolicyLimitsTab]
PolGuardTab --> GuardBrowse[AgentGuardrailBrowseModal]
PolGuardTab --> GuardModal[AgentGuardrailModal]
AIPP3 --> GuardTable[AgentGuardrailsTable]
GuardTable --> GuardModal
AIPP4 --> Tabs[Tabs]
Tabs --> AccessLog[AgentAccessLogTable]
Tabs --> Consumption[AgentConsumptionPanel]
Tabs --> BudgetRules[AgentBudgetRulesTable]
Tabs --> AccountCtl[AgentAccountControlsCard]
BudgetRules --> BudgetModal[AgentBudgetRuleModal]
BudgetModal -.reuses.-> PolLimitsTab
```
### AI Observability tab page
```mermaid
graph LR
Page[AIObservabilityPage] --> RA[RestrictedAccess<br/>permission.services.read]
RA --> GP[GroupsProvider]
GP --> PP[PeersProvider]
PP --> AIP[AIProvidersProvider]
AIP --> Tabs[Tabs / TabsList]
Tabs --> T1[Access Logs<br/>AgentAccessLogTable]
Tabs --> T2[Budget Dashboard<br/>AgentConsumptionPanel]
Tabs --> T3[Budget Settings<br/>AgentBudgetRulesTable]
Tabs --> T4[Log Settings<br/>AgentAccountControlsCard]
T1 -.GET.-> EP[/events/proxy?agent_network=true/]
T2 -.GET poll 5s.-> CONS[/agent-network/consumption/]
T3 -.GET/PUT.-> BR[/agent-network/budget-rules/]
T4 -.GET/PUT.-> ST[/agent-network/settings/]
```
### Data fetch path
```mermaid
graph TD
Page[Page component] --> Prov[AIProvidersProvider]
Prov -->|useFetchApi| SWR[(SWR cache<br/>key = URL)]
SWR -.GET.-> P[/agent-network/providers/]
SWR -.GET.-> POL[/agent-network/policies/]
SWR -.GET.-> G[/agent-network/guardrails/]
SWR -.GET.-> BR[/agent-network/budget-rules/]
SWR -.GET ignoreError.-> ST[/agent-network/settings/]
SWR -.GET.-> CAT[/agent-network/catalog/providers/]
SWR -.GET pageSize=100.-> EVT[/events/proxy agent_network=true/]
Prov --> Mut[useApiCall.post/put/del]
Mut -.on success.-> MutateSWR[SWR mutate keys]
Prov --> Children[Tables / Modals via useAIProviders]
```
Every list view reaches management through SWR over `/api/agent-network/*`. The provider context maps snake-case payloads to camelCase domain types (`fromAPI`, `policyFromAPI`, `guardrailFromAPI`, `budgetRuleFromAPI`, `settingsFromAPI`, `accessLogFromAPI` — AIProvidersProvider.tsx:138-562) and back via matching `*ToRequest` adaptors. The access log piggy-backs on `/events/proxy` with `agent_network=true&page_size=100` (line 707-709) and decodes LLM-specific fields from per-event `metadata`. Group IDs on events are resolved to current names through the surrounding GroupsProvider catalog (lines 515-521, 717-731) — no extra round trip. Mutators run `*ToRequest`, await `useApiCall.post/put/del`, call SWR `mutate()`, then `notify`. Errors caught and surfaced via `notify` — no exceptions escape into render. The Connect Provider modal's open state lives in the provider itself (`isWizardOpen` at lines 732-735) so the providers-page empty-state CTA and the table's + button share one modal. Control-center re-fetches `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider` — SWR de-dupes but the code path is harder to reason about.
## Public contracts consumed
- `GET/POST /api/agent-network/providers`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/policies`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/guardrails`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/budget-rules`, `PUT/DELETE /:id`
- `GET/PUT /api/agent-network/settings` (ignoreError-tolerant; 404 = not yet bootstrapped — auto-bootstrap on first provider create via `bootstrap_cluster` field — AIProvidersProvider.tsx:737-760)
- `GET /api/agent-network/catalog/providers` (read-only declarative; backend owns vendor list, IDs, brand colors, models, extra_headers, identity_injection — useProviderCatalog.ts:6-95)
- `GET /api/agent-network/consumption` (polled every 5s on Budget Dashboard — ConsumptionPanel.tsx:53,65-71)
- `GET /api/events/proxy?agent_network=true&page_size=100` (shared with Proxy Events)
- `permission?.services?.read` gates every agent-network route via RestrictedAccess.
`AIProviderId` is a closed union in dashboard types (data/mockData.ts:8-21) but the converter tolerates anything the backend ships — unknown ids fall through to `"custom"` (AIProvidersProvider.tsx:497-506). Catalog values are pure read-through: anything declared in `extra_headers` renders in the modal automatically, copy keyed by header name (`EXTRA_HEADER_UI` in AIProviderModal.tsx:61-89), labeled-fallback for unknown ones.
## Invariants
- Provider context wrap order on user-attribution pages: `GroupsProvider > PeersProvider > AIProvidersProvider` (observability/page.tsx:87-89). Reverse it and access-log group resolution silently drops names.
- Every agent-network route checks `permission?.services?.read` via `RestrictedAccess` (observability/page.tsx:85, providers/page.tsx:184, policies/page.tsx:53, guardrails/page.tsx:55).
- Modal `key={open ? 1 : 0}` pattern is used to force unmount/remount on close so internal `useState` resets between edits (AgentBudgetRuleModal.tsx:60, AgentPolicyModal.tsx:66). Removing this would leak prior-row state into a new-row session.
- `mockData.ts` is the canonical home for ALL agent-network domain types; `MOCK_*` constants must never reach a production code path. One leak remains (below).
## Things to scrutinize
### Correctness
- **Tab-state URL hand-off is one-way.** observability/page.tsx:53-58 reads `?tab=` on mount (despite the file comment at line 28 saying URL hand-off is future) but `setTab` does NOT push back, so reload preserves the chosen tab only if it came in via the link. Inconsistent with control-center (page.tsx:1817-1831).
- **Provider overlay runs only in `applySingleGroupView` / `applyPeerView`** (control-center/page.tsx:557, 1159-1166). User view does NOT show providers — if agent-network is a primary lens, that's a gap.
- **Two useEffects race to invalidate the control-center layout.** page.tsx:1655-1657 drops `layoutInitialized` when `agentPolicies` / `agentProviders` arrive; the main effect (1786-1799) also lists them as deps. Functional but fragile — watch for flash-of-empty-graph.
- **`updateProvider` / `updatePolicy` / `updateBudgetRule` use `??` on `enabled`** (AIProvidersProvider.tsx:784, 859, 1018). Toggle paths are safe; any caller sending `enabled: false` thinking "leave it off" gets `existing.enabled` instead. Audit modal callers.
- **Form validation in modals is minimal.** Window-seconds picker — mockData.ts:209-215 documents "minimum 60 — one minute" but there is no matching UI guard in PolicyLimitsTab; the backend validator is the enforcement point.
### Security
- **No client-side enforcement claims** — every cap, allowlist, and toggle is display + edit; proxy is the source of truth for deny decisions (AccessLogTable.tsx:177-191 renders backend-emitted `denyReason` as-is).
- **Prompt display is gated by what the backend stamps.** When `enable_prompt_collection` is OFF the proxy must not put prompt/completion into event metadata; the dashboard renders whatever it gets verbatim (AccessLogTable lines 532-534, AccessLogExpandedRow.tsx:42-57). No UI filter on top of backend collection switches.
- Account Controls disables `Redact PII` when `Prompt Collection` is off (AgentAccountControlsCard.tsx:122) and clears it on off-transition (line 100), but relies on backend to enforce the same gate at write — confirm PUT handler rejects `redact_pii=true && enable_prompt_collection=false`.
- **Bifrost identity-header overrides**: empty-string vs nil semantics documented in AIProvidersProvider.tsx:772-781 ("omitted = preserve, empty = explicit clear"). Mishandling could leak group attribution to a header the operator thought disabled. Focused read of Bifrost code path in AIProviderModal.tsx recommended.
### Accessibility
- Observability TabsList (observability/page.tsx:96-113) uses the shared Tabs component — should inherit Radix roving-tabindex. All four TabsTriggers carry only icon + text, no `aria-label`; fine because text is visible.
- Modal focus traps are inherited from the shared Modal; agent-network modals don't override them. Quick keyboard pass recommended.
- `EndpointBadge` Copy button (providers/page.tsx:66-76) has an `aria-label`, good.
### Performance
- `AgentConsumptionPanel` polls `/agent-network/consumption` every 5s (ConsumptionPanel.tsx:53,70). Tab switches unmount the panel, so the poll stops — verify in network panel.
- `AgentAccessLogTable` is hard-capped at 100 rows via `page_size=100` (AIProvidersProvider.tsx:707-709). Server-side pagination is future work; high-traffic tenants miss everything past row 100 — known limitation.
- Observability page mounts providers ONCE at page level (observability/page.tsx:87-89); tab switches keep SWR cache hot. Moving the provider mount inside `TabsContent` would re-fetch the access log on every switch.
### Visual consistency
- The observability tab style mirrors peers/page.tsx. Outer Tabs `pt-4 pb-0 mb-0`, TabsList `px-8` (observability/page.tsx:94-96) — confirm chrome height matches so the page doesn't visually jump.
- Sidebar: `Boxes` for Providers, `AccessControlIcon` for Policies, `TelescopeIcon` for AI Observability (Navigation.tsx:113,120,133). Reusing `AccessControlIcon` makes Policies look identical to the (now hidden) Access Control item — if Access Control ever comes back, they collide.
- `AgentNetworkIcon` is used in breadcrumbs on every agent-network page but NOT in the sidebar (per-page icons instead). Deliberate departure — record so it doesn't get reverted.
## Test coverage
- **Cypress**: One file (`cypress/e2e/test.cy.ts`) covering only the install-page copy-to-clipboard flow. NOTHING covers agent-network UI.
- **Component / unit tests**: `src/utils/version.test.ts` is the only `.test.*` file in the repo. The agent-network modules ship without component tests.
- Data-cy hooks exist on key controls: `save-account-controls` (AgentAccountControlsCard.tsx:71), `enable-log-collection`, `enable-prompt-collection`, `redact-pii`, plus existing `data-cy={policy.name}` / `data-cy={provider.name}` on ActiveInactiveRow. Sufficient hooks for Cypress flows; none written yet.
- **Tooling gap (pre-existing):** `npm run lint` (`next lint`) is broken in Next 16 — the `lint` subcommand was removed from the Next CLI in 16.x, so the dashboard effectively has no working lint gate. The fix is to add either a flat-config `eslint .` script or wire ESLint via an explicit `eslint-config-next` invocation.
## Known limitations / explicit non-goals
- **`data/mockData.ts` still contains `MOCK_GROUPS`, `MOCK_PROVIDERS`, `MOCK_PEERS`.** Only `MOCK_GROUPS` is referenced from production — AgentPoliciesTable.tsx:45,76 uses it as a name-lookup fallback when a policy references a group ID the real GroupsProvider doesn't know about. `MOCK_PROVIDERS` / `MOCK_PEERS` are unreferenced; safe to delete. The file is `/* eslint-disable */` so dead-code warnings don't flag them.
- **Tab-state URL hand-off on observability page is one-way** (read-only).
- **Access log hard-capped at 100 rows**; no server-side pagination.
- **No optimistic updates.** All mutations are round-trip; failures rollback via SWR revalidation.
- **`FlowView.NETWORKS` retained but hidden** from FlowSelector (FlowSelector.tsx:9-14). Old `?tab=networks` links still route to the hidden view because `applyNetworksView` still runs.
- **Redirects are not query-preserving** — `router.replace("/peers/devices")` (peers/page.tsx:13) strips any incoming filter params.
- **Control-center cross-fetches** `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider`. Could be collapsed.
- **Sidebar permanently hides Access Control, Networks, Reverse Proxy, standalone Guardrails, DNS, Activity, Consumption.** Routes still resolve via URL (Navigation.tsx:165-171); intentional.
## Cross-references
- Upstream API contracts: [shared/api](10-shared-api.md)
- Backend persistence: [management/store](20-management-store.md)
- Backend handler wiring: [management/handlers + wiring](22-management-handlers-wiring.md)
- End-to-end flow narrative: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level overview: [../00-overview.md](../00-overview.md)

View File

@@ -1,251 +0,0 @@
# path-routed providers — Vertex AI + Bedrock
This guide pulls the **path-routed** provider story together in one place
because it crosses the catalog, the synthesiser, the request parser, and the
router. The relevant building blocks are the `llm_router` /
`llm_request_parser` middlewares
([31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)), the
per-provider parser surface ([32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)),
and the synthesiser's catalog → `ProviderRoute` mapping
([21-management-agentnetwork.md](21-management-agentnetwork.md)).
Sibling modules: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
(router + request parser) and [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
(Bedrock parser + pricing).
---
## What "path-routed" means
Most catalog providers carry the model in the request **body** (`{"model": …}`),
so `llm_router` selects an upstream by matching the model name against each
provider's `Models` claim. Two providers instead carry the model in the **URL
path**, so they are routed by path before the model/vendor table is consulted:
| Catalog id | Style flag | Request path shape |
|---|---|---|
| `vertex_ai_api` | `IsVertexPathStyle``ProviderRoute.Vertex` | `/v1/projects/{project}/locations/{region}/publishers/{publisher}/models/{model}:{action}` |
| `bedrock_api` | `IsBedrockPathStyle``ProviderRoute.Bedrock` | `/model/{modelId}/{action}` (optionally behind `/bedrock`) |
The catalog declares the style with
[`catalog.IsVertexPathStyle` / `catalog.IsBedrockPathStyle`](../../../management/server/agentnetwork/catalog/catalog.go)
and the synthesiser copies the result onto the router route as the `Vertex` /
`Bedrock` booleans
([synthesizer.go:450-451](../../../management/server/agentnetwork/synthesizer.go)).
On the request leg `llm_router.Invoke` dispatches `isVertexPath` / `isBedrockPath`
**before** the model lookup
([llm_router/middleware.go:138-216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
so a model the parser extracted from the path can't be claimed by a same-vendor
*body-routed* provider (e.g. `claude-*` on `api.anthropic.com`).
## Google Vertex AI (`vertex_ai_api`)
### Catalog entry
`KindProvider`, parser surface left unset on the catalog entry — the request
parser picks the parser from the URL **publisher** segment, not from
`ParserID`. Upstream host is `<region>-aiplatform.googleapis.com`
(`https://aiplatform.googleapis.com` for the `global` location). The catalog
lists the Claude-on-Vertex lineup (`claude-opus-4-*`, `claude-sonnet-4-*`,
`claude-haiku-4-5`, `claude-fable-5`) at the same per-token rates as the
first-party Anthropic entry
([catalog.go:333-363](../../../management/server/agentnetwork/catalog/catalog.go)).
### Credential — service-account OAuth (`keyfile::`)
Vertex does **not** accept a static API key. The operator sets the provider
`api_key` to:
```
keyfile::<base64 of the GCP service-account JSON key>
```
The synthesiser recognises the `keyfile::` prefix in `providerAuthHeader`
([synthesizer.go:897-903](../../../management/server/agentnetwork/synthesizer.go)),
emits **no** static auth value, and carries the base64 key material on the
route as `GCPServiceAccountKeyB64`
([factory.go:56-61](../../../proxy/internal/middleware/builtin/llm_router/factory.go)).
At request time the router mints a short-lived OAuth2 access token from the key
(cloud-platform scope) and injects `Authorization: Bearer <access-token>`
never the key itself
([llm_router/middleware.go:621-692](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
- One auto-refreshing `oauth2.TokenSource` is cached per key (keyed by a
SHA-256 of the base64 material), so token minting happens once and refreshes
amortise across requests.
- Mint / refresh is bounded by a 10s timeout HTTP client (`gcpTokenTimeout`) so
a slow Google token endpoint can't hang the request.
- A malformed key or an unreachable token endpoint fails the request with
`llm_policy.upstream_auth_failed` at HTTP **502** (an upstream problem, not a
policy denial) — see `denyUpstreamAuth`.
### Metering — Anthropic-on-Vertex only
The request parser extracts `{publisher, model, action}` from the path
(`parseVertexPath`, [llm_request_parser/middleware.go:237-263](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)),
strips the `@version` suffix from the model, and maps the publisher to a parser
surface via `vertexPublisherVendor`:
- `anthropic``llm.provider="anthropic"` → metered through the Anthropic
parser, priced under the **`anthropic`** block in `defaults_pricing.yaml`
(the parser emits the standard Anthropic provider label, so Vertex Claude
reuses first-party Anthropic prices).
- `openai``llm.provider="openai"` (reserved; not in the catalog lineup
today).
- anything else (notably `google` / Gemini) → empty vendor → **no parser**.
**Gemini is intentionally denied as unmeterable.** When the parser emits no
`llm.provider` for a Vertex publisher, `llm_router` returns
`llm_policy.unmeterable_publisher` (403) rather than forwarding the request
uncounted — serving it would bypass token / budget metering
([llm_router/middleware.go:144-162, 712-728](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
A Gemini parser would lift this restriction; until then the `google` publisher
is omitted from the catalog.
> Caveat: cross-region inference profiles in `eu` / `apac` carry a ~10% price
> premium that the base per-token rates do **not** model — cost annotations for
> those regions read low. Operators who need exact regional billing override
> the affected entries in `pricing.yaml`.
## AWS Bedrock (`bedrock_api`)
### Catalog entry
`KindProvider`, upstream host `bedrock-runtime.<region>.amazonaws.com`. Metered
models are the Anthropic-on-Bedrock lineup (`anthropic.claude-*`) plus Amazon
Nova and Llama 3.3 entries
([catalog.go:300-332](../../../management/server/agentnetwork/catalog/catalog.go)).
Anthropic-on-Bedrock reuses the first-party Claude prices (with additive cache
buckets); Nova / Llama report no cache, so cost is `input + output`.
### Credential — static bearer token
Bedrock uses the **AWS Bedrock API key** as a static bearer. The operator sets
the provider `api_key` directly (no `keyfile::` prefix); the catalog template
is `Authorization: Bearer ${API_KEY}`
([catalog.go:306-307](../../../management/server/agentnetwork/catalog/catalog.go)).
No token minting — the synthesiser substitutes the key into the template and
the router injects the resulting `Authorization` header after stripping inbound
vendor auth (including client-supplied AWS SigV4 material: `X-Amz-Date`,
`X-Amz-Security-Token`, `X-Amz-Content-Sha256`, see `strippedAuthHeaders`).
### Model id form — cross-region inference profiles
Bedrock model ids in the request path must be the cross-region
**inference-profile** form, e.g.
`eu.anthropic.claude-sonnet-4-5-20250929-v1:0`. The bare
`anthropic.claude-…` id is rejected by AWS. `normalizeBedrockModel`
([llm_request_parser/middleware.go:398-414](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
strips the region prefix (`us.` / `eu.` / `apac.` / `global.`), an optional ARN
wrapper, and the `-YYYYMMDD-vN[:N]` version/throughput suffix so the normalised
id (`anthropic.claude-sonnet-4-5`) matches the catalog/pricing key.
### Supported endpoints + actions
`/model/{modelId}/{action}` where action ∈ `invoke`,
`invoke-with-response-stream`, `converse`, `converse-stream`
([llm_request_parser/middleware.go:363-390](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
`invoke` / `converse` are non-streaming; the `-stream` actions set the streaming
flag.
- **InvokeModel** body uses the vendor-native shape — for Anthropic that means
`"anthropic_version":"bedrock-2023-05-31"` and snake_case usage with additive
cache buckets.
- **Converse** uses the unified camelCase shape with a precomputed `totalTokens`.
- The `BedrockParser` reads both shapes on the response leg
([bedrock.go](../../../proxy/internal/llm/bedrock.go)); the request parser
doesn't need to distinguish them (`ParseRequest` is a no-op — model + stream
come from the path).
### Streaming — AWS binary event-stream
The `-stream` actions return `application/vnd.amazon.eventstream` (the AWS
binary event-stream framing), and streaming **is metered**.
`accumulateBedrockStream`
([llm_response_parser/streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go))
decodes the frames with `aws-sdk-go-v2/aws/protocol/eventstream`:
- InvokeModel `chunk` frames wrap a base64 `{"bytes":…}` payload carrying a
vendor-native (Anthropic) stream event — folded through the shared Anthropic
stream accumulator.
- Converse `contentBlockDelta` frames carry text; the trailing `metadata` frame
carries the final usage block.
- A truncated stream (cut at the body-tap capture cap) decodes best-effort:
frames up to the cut are applied and partial usage is returned.
### Optional `/bedrock` gateway-namespace prefix
Clients may place an optional `/bedrock` prefix before the native path
(`/bedrock/model/{modelId}/{action}`) to disambiguate Bedrock from other
providers that also use `/model/...`. Both the request parser
(`trimBedrockNamespace`) and the router (`splitBedrockNamespace`) accept it.
When the prefix is present, the router sets
`RewriteUpstream.StripPathPrefix = "/bedrock"` so the **native** path
(`/model/...`) is what reaches `bedrock-runtime.<region>.amazonaws.com`
([llm_router/middleware.go:168-184, 320-348](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
## Model allowlist on path-routed providers
Because the model lives in the URL rather than the body, a path-routed provider
credential could otherwise be used for any model the upstream supports. The
router still enforces the route's `Models` allowlist via `matchPathRoute`
([llm_router/middleware.go:370-416](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
1. Filter to routes of the matching style (`Vertex` / `Bedrock`).
2. Filter to routes whose `AllowedGroupIDs` authorise the caller's groups
(else `no_authorised_provider`).
3. Filter to routes that **claim the requested model**. As with body-routed
providers, an **empty `Models` list = catch-all** (serve any model);
a non-empty list serves only the listed models (else `model_not_routable`).
4. Multiple survivors disambiguate by longest `UpstreamPath` prefix match.
So an operator who lists explicit models on a Vertex/Bedrock provider gets a
hard allowlist; an operator who leaves `Models` empty accepts every model the
upstream serves (still subject to the unmeterable-publisher gate on Vertex).
Model-less OpenAI endpoints (`GET /v1/models`) are **never** routed to a
Vertex/Bedrock provider — `matchModelless` skips path-routed routes
([llm_router/middleware.go:427-462](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
so a model-listing call can't be rewritten onto an upstream that would 404 it.
## Catalog ↔ pricing cross-check
Catalog prices and context windows are cross-checked against LiteLLM's
`model_prices_and_context_window.json`. The proxy's embedded
`defaults_pricing.yaml` covers **every metered first-party model** the catalog
enumerates — guarded by
`TestDefaultTable_FirstPartyModelCoverage`
([pricing/defaults_coverage_test.go](../../../proxy/internal/llm/pricing/defaults_coverage_test.go)),
which fails if a catalog model has no embedded price. Bedrock entries are keyed
by the **normalised** id the request parser emits (region prefix + version
suffix stripped). Vertex Claude carries no Bedrock-style prefix, so it prices
straight off the `anthropic` block.
## Things to scrutinise
**Security.** The Vertex service-account key is never forwarded — only a minted
short-lived bearer. Confirm the key material stays out of access logs (it lives
on `ProviderRoute.GCPServiceAccountKeyB64`, not in any emitted metadata key).
The unmeterable-publisher deny is the only thing standing between an
operator-misconfigured Vertex provider and unmetered Gemini traffic; verify
`vertexPublisherVendor` stays conservative (deny by default for unknown
publishers).
**Correctness.** `normalizeBedrockModel` is the join between the wire id and the
pricing key — a model that normalises to something not in `defaults_pricing.yaml`
meters at `cost.skipped=unknown_model` rather than failing the request. The
`/bedrock` prefix strip must run on both the parser side (so the model is
extracted) and the router side (so the upstream path is native); a regression in
either silently breaks the other.
**Metering caveats.** eu/apac cross-region Bedrock + Vertex profiles carry a
~10% premium not modelled by base pricing — flagged in both the catalog comment
and `defaults_pricing.yaml`. Operators needing exact regional billing override
the relevant entries.
## Cross-references
- Router + request-parser detail: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
- Bedrock parser + pricing + SSE / event-stream: [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
- Catalog → route synthesis + `keyfile::` handling: [21-management-agentnetwork.md](21-management-agentnetwork.md)
- Overview: [../00-overview.md](../00-overview.md)

4
go.mod
View File

@@ -35,7 +35,6 @@ require (
github.com/DeRuina/timberjack v1.4.2
github.com/awnumar/memguard v0.23.0
github.com/aws/aws-sdk-go-v2 v1.38.3
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1
github.com/aws/aws-sdk-go-v2/config v1.31.6
github.com/aws/aws-sdk-go-v2/credentials v1.18.10
github.com/aws/aws-sdk-go-v2/service/s3 v1.87.3
@@ -54,7 +53,7 @@ require (
github.com/eko/gocache/lib/v4 v4.2.0
github.com/eko/gocache/store/go_cache/v4 v4.2.2
github.com/eko/gocache/store/redis/v4 v4.2.2
github.com/fsnotify/fsnotify v1.9.0
github.com/fsnotify/fsnotify v1.10.1
github.com/gliderlabs/ssh v0.3.8
github.com/go-jose/go-jose/v4 v4.1.4
github.com/gobwas/ws v1.4.0
@@ -157,6 +156,7 @@ require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/awnumar/memcall v0.4.0 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect

4
go.sum
View File

@@ -173,8 +173,8 @@ github.com/fredbi/uri v1.1.1/go.mod h1:4+DZQ5zBjEwQCDmXW5JdIjz0PUA+yJbvtBv+u+adr
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/fsnotify/fsnotify v1.10.1 h1:b0/UzAf9yR5rhf3RPm9gf3ehBPpf0oZKIjtpKrx59Ho=
github.com/fsnotify/fsnotify v1.10.1/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
github.com/fxamacker/cbor/v2 v2.9.1 h1:2rWm8B193Ll4VdjsJY28jxs70IdDsHRWgQYAI80+rMQ=
github.com/fxamacker/cbor/v2 v2.9.1/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
github.com/fyne-io/gl-js v0.2.0 h1:+EXMLVEa18EfkXBVKhifYB6OGs3HwKO3lUElA0LlAjs=

View File

@@ -1,616 +0,0 @@
#!/bin/bash
set -e
set -o pipefail
# NetBird Enterprise — Getting Started
# Single-node bootstrap for a self-hosted NetBird Enterprise stack with the
# embedded identity provider. Owner is created via first-login flow.
SED_STRIP_PADDING='s/=//g'
check_docker_compose() {
if command -v docker-compose &> /dev/null; then
echo "docker-compose"
return
fi
if docker compose --help &> /dev/null; then
echo "docker compose"
return
fi
echo "docker-compose is not installed or not in PATH. See https://docs.docker.com/engine/install/" > /dev/stderr
exit 1
}
check_openssl() {
if ! command -v openssl &> /dev/null; then
echo "openssl is not installed or not in PATH." > /dev/stderr
exit 1
fi
}
rand_secret() {
openssl rand -base64 32 | sed "$SED_STRIP_PADDING"
}
rand_b64_key() {
openssl rand -base64 32
}
check_nb_domain() {
local domain="$1"
if [[ -z "$domain" ]]; then
echo "The domain cannot be empty." > /dev/stderr
return 1
fi
if [[ "$domain" == "netbird.example.com" ]]; then
echo "The domain cannot be netbird.example.com" > /dev/stderr
return 1
fi
if [[ "$domain" =~ ^[0-9.]+$ ]]; then
echo "An IP address is not allowed. A real DNS-resolvable domain is required for TLS and the embedded IdP issuer." > /dev/stderr
return 1
fi
if [[ ! "$domain" =~ ^[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?(\.[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?)+$ ]]; then
echo "The value '$domain' is not a valid FQDN. A real DNS-resolvable domain is required for TLS and the embedded IdP issuer." > /dev/stderr
return 1
fi
return 0
}
check_domain_resolves() {
local domain="$1"
if command -v getent &> /dev/null && getent hosts "$domain" &> /dev/null; then return 0; fi
if command -v host &> /dev/null && host "$domain" &> /dev/null; then return 0; fi
if command -v dig &> /dev/null && [[ -n "$(dig +short "$domain" 2>/dev/null)" ]]; then return 0; fi
if command -v nslookup &> /dev/null && nslookup "$domain" &> /dev/null; then return 0; fi
return 1
}
read_nb_domain() {
local value=""
echo -n "Enter the FQDN for NetBird (must resolve via DNS, e.g. netbird.my-domain.com): " > /dev/stderr
read -r value < /dev/tty
if ! check_nb_domain "$value"; then
read_nb_domain
return
fi
if ! check_domain_resolves "$value"; then
echo "" > /dev/stderr
echo "Warning: '$value' does not resolve via DNS from this host." > /dev/stderr
echo "Caddy will not be able to issue TLS certificates until it does." > /dev/stderr
local confirm=""
echo -n "Continue anyway? [y/N]: " > /dev/stderr
read -r confirm < /dev/tty
if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
read_nb_domain
return
fi
fi
echo "$value"
}
read_required() {
local prompt="$1"
local value=""
while [[ -z "$value" ]]; do
echo -n "$prompt: " > /dev/stderr
read -r value < /dev/tty
if [[ -z "$value" ]]; then
echo "Value cannot be empty." > /dev/stderr
fi
done
echo "$value"
}
read_secret() {
local prompt="$1"
local value=""
while [[ -z "$value" ]]; do
echo -n "$prompt: " > /dev/stderr
read -rs value < /dev/tty
echo "" > /dev/stderr
if [[ -z "$value" ]]; then
echo "Value cannot be empty." > /dev/stderr
fi
done
echo "$value"
}
# read_yes_no "<prompt>" [<default y|n>]
read_yes_no() {
local prompt="$1"
local default="${2:-n}"
local hint
if [[ "$default" == "y" ]]; then
hint="[Y/n]"
else
hint="[y/N]"
fi
echo -n "${prompt} ${hint}: " > /dev/stderr
local ans=""
read -r ans < /dev/tty
if [[ -z "$ans" ]]; then
ans="$default"
fi
case "$ans" in
[Yy] | [Yy][Ee][Ss]) echo "yes" ;;
*) echo "no" ;;
esac
}
wait_postgres() {
set +e
echo -n "Waiting for postgres to become ready"
local counter=1
while true; do
if $DOCKER_COMPOSE_COMMAND exec -T postgres pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" &> /dev/null; then
break
fi
if [[ $counter -eq 60 ]]; then
echo ""
echo "Postgres is taking too long. Recent logs:"
$DOCKER_COMPOSE_COMMAND logs --tail=20 postgres
exit 1
fi
echo -n " ."
sleep 2
counter=$((counter + 1))
done
echo " done"
set -e
}
init_environment() {
check_openssl
DOCKER_COMPOSE_COMMAND=$(check_docker_compose)
if [[ -f .env ]] || [[ -f docker-compose.yml ]] || [[ -f config.yaml ]] || [[ -f Caddyfile ]]; then
echo "Generated files already exist in $(pwd)."
echo "If you want to reinitialize the environment, please remove them first:"
echo " $DOCKER_COMPOSE_COMMAND down --volumes # removes all containers and volumes"
echo " rm -f .env docker-compose.yml Caddyfile config.yaml"
echo "Be aware this will remove all data from the database."
exit 1
fi
echo "NetBird Enterprise bootstrap"
echo ""
echo "Traffic flow:"
echo " Enables traffic events logging on the management server."
echo " When enabled, the NetBird stack also runs NATS along with two"
echo " additional containers: netbird-receiver (the traffic log receiver"
echo " service) and netbird-enricher (the traffic log enricher service)."
echo " It still has to be turned on from the dashboard settings afterwards."
echo " See https://docs.netbird.io/manage/activity/traffic-events-logging"
NETBIRD_TRAFFIC_FLOW=$(read_yes_no "Enable traffic flow" "n")
echo ""
NETBIRD_DOMAIN=$(read_nb_domain)
echo ""
NETBIRD_LICENSE_KEY=$(read_secret "Enter license key (input hidden)")
GHCR_USERNAME="netbirdExtAccess1"
GHCR_TOKEN=$(read_secret "Enter GHCR token (input hidden)")
POSTGRES_USER="netbird"
POSTGRES_DB="netbird"
POSTGRES_PASSWORD=$(rand_secret)
NETBIRD_ENCRYPTION_KEY=$(rand_b64_key)
NETBIRD_RELAY_AUTH_SECRET=$(rand_secret)
POSTGRES_DSN="host=postgres user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} dbname=${POSTGRES_DB} port=5432 sslmode=disable TimeZone=UTC"
NETBIRD_RELAY_ENDPOINT="rels://${NETBIRD_DOMAIN}:443"
echo ""
echo "Selected:"
echo " Traffic flow: ${NETBIRD_TRAFFIC_FLOW}"
echo " Domain: ${NETBIRD_DOMAIN}"
echo ""
echo "Rendering files into $(pwd) ..."
install -m 600 /dev/null .env
render_env >> .env
render_docker_compose > docker-compose.yml
if [[ -z "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
sed -i.bak '/NETBIRD_LICENSE_SERVER_BASE_URL/d' docker-compose.yml && rm -f docker-compose.yml.bak
fi
render_caddyfile > Caddyfile
install -m 600 /dev/null config.yaml
render_config_yaml >> config.yaml
echo "Logging in to ghcr.io ..."
printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
unset GHCR_TOKEN
echo ""
echo "Pulling images ..."
$DOCKER_COMPOSE_COMMAND pull
echo ""
echo "Starting postgres ..."
$DOCKER_COMPOSE_COMMAND up -d postgres
sleep 2
wait_postgres
echo ""
echo "Starting remaining services ..."
$DOCKER_COMPOSE_COMMAND up -d
echo ""
echo "Done."
echo ""
echo "Dashboard: https://${NETBIRD_DOMAIN}"
echo ""
echo "Open the dashboard in a browser to complete the first-login owner setup."
echo "All configuration and secrets are stored (mode 600) in $(pwd)/.env"
echo ""
echo "Tail logs:"
echo " cd $(pwd) && $DOCKER_COMPOSE_COMMAND logs -f netbird-server caddy"
}
# ------------------------------------------------------------------
# Renderers
# ------------------------------------------------------------------
render_env() {
cat <<EOF
# Generated by getting-started-enterprise.sh
# Holds all configuration and secrets for the stack. Mode 600.
# Features (set by the script; don't edit without re-running)
NETBIRD_TRAFFIC_FLOW_ENABLED=${NETBIRD_TRAFFIC_FLOW}
# Domain
NETBIRD_DOMAIN=${NETBIRD_DOMAIN}
# Image tags. Default to "latest"
NETBIRD_DASHBOARD_TAG=${NETBIRD_DASHBOARD_TAG:-latest}
NETBIRD_SERVER_TAG=${NETBIRD_SERVER_TAG:-latest}
EOF
if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
cat <<EOF
NETBIRD_ENRICHER_TAG=${NETBIRD_ENRICHER_TAG:-latest}
NETBIRD_RECEIVER_TAG=${NETBIRD_RECEIVER_TAG:-latest}
EOF
fi
cat <<EOF
# License keys
EOF
if [[ -n "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
cat <<EOF
NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
EOF
fi
cat <<EOF
NETBIRD_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
EOF
cat <<EOF
# Postgres
POSTGRES_USER=${POSTGRES_USER}
POSTGRES_DB=${POSTGRES_DB}
POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
NETBIRD_STORE_ENGINE_POSTGRES_DSN=${POSTGRES_DSN}
# Relay
NETBIRD_RELAY_ENDPOINT=${NETBIRD_RELAY_ENDPOINT}
NETBIRD_RELAY_AUTH_SECRET=${NETBIRD_RELAY_AUTH_SECRET}
# Datastore encryption
NETBIRD_ENCRYPTION_KEY=${NETBIRD_ENCRYPTION_KEY}
# Dashboard OIDC scopes
NETBIRD_AUTH_SUPPORTED_SCOPES=${NETBIRD_AUTH_SUPPORTED_SCOPES:-openid profile email groups}
EOF
}
render_docker_compose() {
render_compose_header
render_compose_common
render_compose_server
if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
render_compose_flow
fi
render_compose_postgres
render_compose_footer
}
render_compose_header() {
cat <<'EOF'
x-default: &default
restart: unless-stopped
logging:
driver: json-file
options:
max-size: '500m'
max-file: '2'
services:
EOF
}
render_compose_common() {
cat <<'EOF'
caddy:
<<: *default
image: caddy:2
container_name: netbird-caddy
networks: [netbird]
environment:
- CADDY_SECURE_DOMAIN=${NETBIRD_DOMAIN}
ports:
- '443:443'
- '443:443/udp'
- '80:80'
volumes:
- netbird_caddy_data:/data
- ./Caddyfile:/etc/caddy/Caddyfile
dashboard:
<<: *default
image: ghcr.io/netbirdio/dashboard-cloud:${NETBIRD_DASHBOARD_TAG}
container_name: netbird-dashboard
networks: [netbird]
environment:
- NETBIRD_MGMT_API_ENDPOINT=https://${NETBIRD_DOMAIN}
- NETBIRD_MGMT_GRPC_API_ENDPOINT=https://${NETBIRD_DOMAIN}
- AUTH_AUDIENCE=netbird-dashboard
- AUTH_CLIENT_ID=netbird-dashboard
- AUTH_CLIENT_SECRET=
- AUTH_AUTHORITY=https://${NETBIRD_DOMAIN}/oauth2
- USE_AUTH0=false
- AUTH_SUPPORTED_SCOPES=${NETBIRD_AUTH_SUPPORTED_SCOPES}
- AUTH_REDIRECT_URI=/nb-auth
- AUTH_SILENT_REDIRECT_URI=/nb-silent-auth
- NETBIRD_TOKEN_SOURCE=accessToken
- NGINX_SSL_PORT=443
- LETSENCRYPT_DOMAIN=
- LETSENCRYPT_EMAIL=
EOF
}
render_compose_server() {
cat <<'EOF'
netbird-server:
<<: *default
image: ghcr.io/netbirdio/netbird-server-cloud:${NETBIRD_SERVER_TAG}
container_name: netbird-server
networks: [netbird]
depends_on:
dashboard:
condition: service_started
postgres:
condition: service_healthy
ports:
- '3478:3478/udp'
volumes:
- netbird_data:/var/lib/netbird
- ./config.yaml:/etc/netbird/config.yaml
command: ["--config", "/etc/netbird/config.yaml"]
environment:
- NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
- NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
EOF
}
render_compose_flow() {
cat <<'EOF'
nats:
<<: *default
image: nats:2
container_name: netbird-nats
networks: [netbird]
volumes:
- netbird_nats_data:/data
command: ["-m", "8222", "--jetstream", "--store_dir", "/data"]
enricher:
<<: *default
image: ghcr.io/netbirdio/flow-enricher-cloud:${NETBIRD_ENRICHER_TAG}
container_name: netbird-enricher
networks: [netbird]
depends_on:
postgres:
condition: service_healthy
nats:
condition: service_started
volumes:
- netbird_enricher:/var/lib/netbird
environment:
- NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
- NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
- NB_DATADIR=/var/lib/netbird
- NB_MANAGEMENT_STORE_ENGINE=postgres
- NB_MANAGEMENT_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
- NETBIRD_STORE_ENGINE_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
- NB_TRAFFIC_EVENT_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
- NB_TRAFFIC_EVENT_STORE_ENGINE=postgres
- NB_MANAGEMENT_STORE_KEY=${NETBIRD_ENCRYPTION_KEY}
- NB_FLOW_ADAPTER_TYPE=nats
- NB_FLOW_NATS_ENDPOINTS=nats://nats:4222
- NB_FLOW_NATS_STREAM=traffic-events
- NB_METRICS_PORT=9091
- NB_PERSISTENCE_RETENTION_PERIOD=168h
receiver:
<<: *default
image: ghcr.io/netbirdio/flow-receiver-cloud:${NETBIRD_RECEIVER_TAG}
container_name: netbird-receiver
networks: [netbird]
depends_on:
nats:
condition: service_started
environment:
- NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
- NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
- NB_FLOW_LISTEN_PORT=80
- NB_FLOW_ADAPTER_TYPE=nats
- NB_FLOW_NATS_ENDPOINTS=nats://nats:4222
- NB_FLOW_NATS_STREAM=traffic-events
- NB_FLOW_AUTH_SECRET=${NETBIRD_RELAY_AUTH_SECRET}
EOF
}
render_compose_postgres() {
cat <<'EOF'
postgres:
<<: *default
image: postgres:17
container_name: netbird-postgres
networks: [netbird]
environment:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_DB=${POSTGRES_DB}
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
interval: 10s
timeout: 5s
retries: 10
volumes:
- netbird_postgres:/var/lib/postgresql/data
EOF
}
render_compose_footer() {
cat <<'EOF'
volumes:
netbird_data:
EOF
if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
cat <<'EOF'
netbird_nats_data:
netbird_enricher:
EOF
fi
cat <<'EOF'
netbird_postgres:
netbird_caddy_data:
networks:
netbird:
EOF
}
render_caddyfile() {
cat <<'EOF'
{
servers :80,:443 {
protocols h1 h2c h2 h3
}
}
(security_headers) {
header * {
Strict-Transport-Security "max-age=3600; includeSubDomains; preload"
X-Content-Type-Options "nosniff"
X-Frame-Options "SAMEORIGIN"
X-XSS-Protection "1; mode=block"
-Server
Referrer-Policy strict-origin-when-cross-origin
}
}
:80 {
redir https://{$CADDY_SECURE_DOMAIN}{uri} permanent
}
{$CADDY_SECURE_DOMAIN}:443 {
import security_headers
# Signal (gRPC over h2c)
reverse_proxy /signalexchange.SignalExchange/* h2c://netbird-server:80
# Management (gRPC over h2c + HTTP)
reverse_proxy /management.ManagementService/* h2c://netbird-server:80
reverse_proxy /api/* netbird-server:80
reverse_proxy /ws-proxy/* netbird-server:80
# Embedded IdP (OAuth2 endpoints served by netbird server)
reverse_proxy /oauth2/* netbird-server:80
# Relay (WebSocket multiplexed on the same port)
reverse_proxy /relay* netbird-server:80
EOF
if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
cat <<'EOF'
# Flow receiver (gRPC over h2c)
reverse_proxy /flow.FlowService/* h2c://receiver:80
EOF
fi
cat <<'EOF'
# Dashboard
reverse_proxy /* dashboard:80
}
EOF
}
render_config_yaml() {
cat <<EOF
# NetBird Enterprise server configuration.
# Generated by getting-started-enterprise.sh. Mode 600.
server:
listenAddress: ":80"
exposedAddress: "https://${NETBIRD_DOMAIN}:443"
metricsPort: 9090
healthcheckAddress: ":9000"
logLevel: "info"
logFile: "console"
# TLS is terminated by Caddy in front; leave this block empty.
tls:
certFile: ""
keyFile: ""
letsencrypt:
enabled: false
authSecret: "${NETBIRD_RELAY_AUTH_SECRET}"
dataDir: "/var/lib/netbird/"
disableAnonymousMetrics: false
disableGeoliteUpdate: false
auth:
issuer: "https://${NETBIRD_DOMAIN}/oauth2"
localAuthDisabled: false
signKeyRefreshEnabled: false
dashboardRedirectURIs:
- "https://${NETBIRD_DOMAIN}/nb-auth"
- "https://${NETBIRD_DOMAIN}/nb-silent-auth"
cliRedirectURIs:
- "http://localhost:53000/"
store:
engine: "postgres"
dsn: "${POSTGRES_DSN}"
encryptionKey: "${NETBIRD_ENCRYPTION_KEY}"
activityStore:
engine: "postgres"
dsn: "${POSTGRES_DSN}"
EOF
if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
cat <<EOF
trafficFlow:
enabled: true
address: "https://${NETBIRD_DOMAIN}:443"
interval: "60s"
EOF
fi
}
init_environment

View File

@@ -398,42 +398,7 @@ configure_domain() {
return 0
}
apply_agent_network_preset() {
# Agent-network turnkey install: built-in Traefik + NetBird Proxy with
# NB_PROXY_PRIVATE=true, dashboard locked to agent-network-only mode.
# Bypasses every reverse-proxy / proxy / CrowdSec prompt. The only
# inputs we still need from the operator are the domain (handled by
# configure_domain via NETBIRD_DOMAIN env var or interactive prompt)
# and the ACME email — both honor env vars first and fall back to a
# prompt only when unset. CrowdSec is intentionally off.
REVERSE_PROXY_TYPE="0"
ENABLE_PROXY="true"
ENABLE_CROWDSEC="false"
if [[ -n "${NETBIRD_LETSENCRYPT_EMAIL}" ]]; then
TRAEFIK_ACME_EMAIL="${NETBIRD_LETSENCRYPT_EMAIL}"
else
TRAEFIK_ACME_EMAIL=$(read_traefik_acme_email)
fi
echo "" > /dev/stderr
echo "Agent-network preset enabled (NETBIRD_AGENT_NETWORK=true):" > /dev/stderr
echo " - reverse proxy: built-in Traefik" > /dev/stderr
echo " - NetBird Proxy: enabled with NB_PROXY_PRIVATE=true" > /dev/stderr
echo " - dashboard: NETBIRD_AGENT_NETWORK_ONLY=true" > /dev/stderr
echo " - CrowdSec: disabled" > /dev/stderr
echo " - Let's Encrypt email: ${TRAEFIK_ACME_EMAIL}" > /dev/stderr
echo "" > /dev/stderr
}
configure_reverse_proxy() {
# Short-circuit: agent-network preset locks every reverse-proxy /
# proxy / CrowdSec choice and bypasses the interactive prompts.
if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
apply_agent_network_preset
return 0
fi
# Prompt for reverse proxy type
REVERSE_PROXY_TYPE=$(read_reverse_proxy_type)
@@ -945,15 +910,6 @@ NGINX_SSL_PORT=443
# Letsencrypt
LETSENCRYPT_DOMAIN=none
EOF
if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
cat <<EOF
# Agent-network preset: dashboard hides the standard NetBird surfaces
# and exposes only the AI Observability + agent-network configuration
# pages. Paired with NB_PROXY_PRIVATE=true on the proxy side.
NETBIRD_AGENT_NETWORK_ONLY=true
EOF
fi
return 0
}
@@ -990,17 +946,6 @@ NB_PROXY_PROXY_PROTOCOL=true
NB_PROXY_TRUSTED_PROXIES=$TRAEFIK_IP
EOF
if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
cat <<EOF
# Agent-network preset: turn the proxy into the private reverse-proxy
# ingress for agent-network synth services. Disables the public-facing
# surface so the proxy serves only synth-generated routes (the
# llm_router-driven LLM endpoints) and the per-account inbound
# listeners on the embedded netstack.
NB_PROXY_PRIVATE=true
EOF
fi
if [[ "$ENABLE_CROWDSEC" == "true" && -n "$CROWDSEC_BOUNCER_KEY" ]]; then
cat <<EOF
NB_PROXY_CROWDSEC_API_URL=http://crowdsec:8080

View File

@@ -1,638 +0,0 @@
#!/bin/bash
set -e
set -o pipefail
# NetBird — community combined → Enterprise combined migration
#
# Non-destructive migration: produces docker-compose.override.yml (auto-loaded
# by docker compose) and config.yaml.enterprise alongside the operator's
# existing files. Original docker-compose.yml and config.yaml are never
# modified.
#
# Steps (all optional, asked interactively):
# 1. Image swap — replace community images with enterprise cloud images.
# 2. Postgres migration — add Postgres, migrate SQLite data via migrate-store.
# 3. Traffic flow — add NATS + flow-enricher + flow-receiver.
#
# To revert:
# docker compose down
# rm -f docker-compose.override.yml config.yaml.enterprise
# # If Postgres migration was done, also restore the SQLite backup printed
# # at the end of this script's run.
# docker compose up -d
OVERRIDE_FILE="docker-compose.override.yml"
ENTERPRISE_CONFIG_FILE="config.yaml.enterprise"
check_docker_compose() {
if command -v docker-compose &> /dev/null; then
echo "docker-compose"
return
fi
if docker compose --help &> /dev/null; then
echo "docker compose"
return
fi
echo "docker-compose is not installed or not in PATH." > /dev/stderr
exit 1
}
check_yq() {
if ! command -v yq &> /dev/null; then
cat > /dev/stderr <<'EOF'
yq is required to parse and update YAML safely.
macOS: brew install yq
Linux: https://github.com/mikefarah/yq/releases (download binary into PATH)
Debian: apt-get install yq (Note: must be the mikefarah Go yq, not the Python wrapper.)
EOF
exit 1
fi
if ! yq --version 2>&1 | grep -q "mikefarah"; then
echo "yq is present but appears to be the wrong implementation. The mikefarah Go-based yq is required (https://github.com/mikefarah/yq)." > /dev/stderr
exit 1
fi
}
check_openssl() {
if ! command -v openssl &> /dev/null; then
echo "openssl is not installed or not in PATH." > /dev/stderr
exit 1
fi
}
rand_password() {
openssl rand -hex 32
}
read_required() {
local prompt="$1"
local value=""
while [[ -z "$value" ]]; do
echo -n "$prompt: " > /dev/stderr
read -r value < /dev/tty
if [[ -z "$value" ]]; then
echo "Value cannot be empty." > /dev/stderr
fi
done
echo "$value"
}
read_secret() {
local prompt="$1"
local value=""
while [[ -z "$value" ]]; do
echo -n "$prompt: " > /dev/stderr
read -rs value < /dev/tty
echo "" > /dev/stderr
if [[ -z "$value" ]]; then
echo "Value cannot be empty." > /dev/stderr
fi
done
echo "$value"
}
read_yes_no() {
local prompt="$1"
local default="${2:-n}"
local hint
if [[ "$default" == "y" ]]; then
hint="[Y/n]"
else
hint="[y/N]"
fi
echo -n "${prompt} ${hint}: " > /dev/stderr
local ans=""
read -r ans < /dev/tty
if [[ -z "$ans" ]]; then
ans="$default"
fi
case "$ans" in
[Yy] | [Yy][Ee][Ss]) echo "yes" ;;
*) echo "no" ;;
esac
}
# ---------------------------------------------------------------------------
# Detection — read the operator's existing compose to find service names and
# paths we need to override. Bail loudly if shape isn't recognised.
# ---------------------------------------------------------------------------
detect_combined_service() {
yq eval '.services | to_entries | map(select(.value.image | test("^netbirdio/netbird-server"))) | .[0].key // ""' "$COMPOSE_FILE"
}
detect_dashboard_service() {
yq eval '.services | to_entries | map(select(.value.image | test("^netbirdio/dashboard"))) | .[0].key // ""' "$COMPOSE_FILE"
}
detect_config_yaml_host_path() {
yq eval ".services[\"$COMBINED_SERVICE\"].volumes[] | select(. | test(\":/etc/netbird/config.yaml\")) | sub(\":/etc/netbird/config.yaml.*\"; \"\") // \"\"" "$COMPOSE_FILE" | head -1
}
detect_data_volume() {
yq eval ".services[\"$COMBINED_SERVICE\"].volumes[] | select(. | test(\":/var/lib/netbird\")) | sub(\":/var/lib/netbird.*\"; \"\") // \"\"" "$COMPOSE_FILE" | head -1
}
detect_exposed_address() {
yq eval '.server.exposedAddress // ""' "$CONFIG_YAML_HOST"
}
detect_compose_network() {
local tag
tag=$(yq eval ".services[\"$COMBINED_SERVICE\"].networks | tag" "$COMPOSE_FILE" 2>/dev/null)
case "$tag" in
"!!seq")
yq eval ".services[\"$COMBINED_SERVICE\"].networks[0]" "$COMPOSE_FILE"
;;
"!!map")
yq eval ".services[\"$COMBINED_SERVICE\"].networks | keys | .[0]" "$COMPOSE_FILE"
;;
*)
echo "default"
;;
esac
}
# ---------------------------------------------------------------------------
# Renderers
# ---------------------------------------------------------------------------
# Build docker-compose.override.yml from the steps the operator selected.
# Service names match what we detected on the operator's side.
render_override() {
cat <<EOF
# Generated by migrate-to-enterprise.sh. Mode 644.
# Merged with docker-compose.yml automatically by Docker Compose.
# Remove this file (and config.yaml.enterprise if present) to revert.
services:
${DASHBOARD_SERVICE}:
image: \${NETBIRD_DASHBOARD_IMAGE:-ghcr.io/netbirdio/dashboard-cloud:latest}
${COMBINED_SERVICE}:
image: \${NETBIRD_SERVER_IMAGE:-ghcr.io/netbirdio/netbird-server-cloud:latest}
environment:
NB_LICENSE_KEY: \${NB_LICENSE_KEY}
NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
EOF
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
cat <<EOF
depends_on:
postgres:
condition: service_healthy
volumes:
- ./${ENTERPRISE_CONFIG_FILE}:/etc/netbird/config.yaml.enterprise:ro
command: ["--config", "/etc/netbird/config.yaml.enterprise"]
postgres:
image: postgres:17
container_name: netbird-postgres
restart: unless-stopped
networks: [${COMPOSE_NETWORK}]
environment:
POSTGRES_USER: netbird
POSTGRES_PASSWORD: \${POSTGRES_PASSWORD}
POSTGRES_DB: netbird
volumes:
- netbird_postgres:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U netbird -d netbird"]
interval: 5s
timeout: 5s
retries: 20
EOF
fi
if [[ "$ENABLE_FLOW" == "yes" ]]; then
cat <<EOF
nats:
image: nats:2
container_name: netbird-nats
restart: unless-stopped
networks: [${COMPOSE_NETWORK}]
command: ["-m", "8222", "--jetstream", "--store_dir", "/data"]
volumes:
- netbird_nats_data:/data
flow-enricher:
image: ghcr.io/netbirdio/flow-enricher-cloud:latest
container_name: netbird-flow-enricher
restart: unless-stopped
networks: [${COMPOSE_NETWORK}]
depends_on:
postgres:
condition: service_healthy
nats:
condition: service_started
environment:
NB_LICENSE_KEY: \${NB_LICENSE_KEY}
NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
NB_DATADIR: /var/lib/netbird
NB_MANAGEMENT_STORE_ENGINE: postgres
NB_MANAGEMENT_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
NB_STORE_ENGINE_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
NB_TRAFFIC_EVENT_STORE_ENGINE: postgres
NB_TRAFFIC_EVENT_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
NB_MANAGEMENT_STORE_KEY: \${NETBIRD_ENCRYPTION_KEY}
NB_FLOW_ADAPTER_TYPE: nats
NB_FLOW_NATS_ENDPOINTS: nats://nats:4222
NB_FLOW_NATS_STREAM: traffic-events
NB_METRICS_PORT: 9091
NB_PERSISTENCE_RETENTION_PERIOD: 168h
flow-receiver:
image: ghcr.io/netbirdio/flow-receiver-cloud:latest
container_name: netbird-flow-receiver
restart: unless-stopped
networks: [${COMPOSE_NETWORK}]
depends_on:
nats:
condition: service_started
environment:
NB_LICENSE_KEY: \${NB_LICENSE_KEY}
NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
NB_FLOW_LISTEN_PORT: 80
NB_FLOW_ADAPTER_TYPE: nats
NB_FLOW_NATS_ENDPOINTS: nats://nats:4222
NB_FLOW_NATS_STREAM: traffic-events
NB_FLOW_AUTH_SECRET: \${NB_FLOW_AUTH_SECRET}
labels:
- traefik.enable=true
- traefik.http.routers.netbird-flow.rule=Host(\`${NETBIRD_HOSTNAME}\`) && PathPrefix(\`/flow.FlowService/\`)
- traefik.http.routers.netbird-flow.entrypoints=websecure
- traefik.http.routers.netbird-flow.tls=true
- traefik.http.routers.netbird-flow.tls.certresolver=letsencrypt
- traefik.http.routers.netbird-flow.service=netbird-flow-h2c
- traefik.http.routers.netbird-flow.priority=100
- traefik.http.services.netbird-flow-h2c.loadbalancer.server.port=80
- traefik.http.services.netbird-flow-h2c.loadbalancer.server.scheme=h2c
EOF
fi
# Volume declarations for anything new the override introduced
local has_volumes="no"
if [[ "$MIGRATE_POSTGRES" == "yes" ]] || [[ "$ENABLE_FLOW" == "yes" ]]; then
has_volumes="yes"
fi
if [[ "$has_volumes" == "yes" ]]; then
cat <<EOF
volumes:
EOF
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
echo " netbird_postgres:"
fi
if [[ "$ENABLE_FLOW" == "yes" ]]; then
echo " netbird_nats_data:"
fi
fi
}
# Build config.yaml.enterprise by yq-editing the operator's existing
# config.yaml. We don't touch the original file.
render_enterprise_config() {
local pg_dsn="host=postgres user=netbird password=${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
yq eval "
.server.store.engine = \"postgres\" |
.server.store.dsn = \"$pg_dsn\" |
.server.activityStore.engine = \"postgres\" |
.server.activityStore.dsn = \"$pg_dsn\" |
.server.authStore.engine = \"postgres\" |
.server.authStore.dsn = \"$pg_dsn\"
" "$CONFIG_YAML_HOST" > "$ENTERPRISE_CONFIG_FILE"
if [[ "$ENABLE_FLOW" == "yes" ]]; then
local flow_addr="${NETBIRD_DOMAIN}"
yq eval -i "
.server.trafficFlow.enabled = true |
.server.trafficFlow.address = \"$flow_addr\" |
.server.trafficFlow.interval = \"60s\"
" "$ENTERPRISE_CONFIG_FILE"
fi
}
# ---------------------------------------------------------------------------
# Execution steps
# ---------------------------------------------------------------------------
resolve_data_volume() {
local short="$1"
local actual
# Resolve project-prefixed volume name from Docker Compose config first.
actual=$($DOCKER_COMPOSE_COMMAND config 2>/dev/null | yq eval ".volumes.\"$short\".name" - 2>/dev/null)
if [[ -n "$actual" && "$actual" != "null" ]]; then
echo "$actual"
return
fi
# Relative bind mount: docker-compose resolves it against the compose
# file's directory, but `docker run -v` resolves it against the current
# working directory. Normalize to an absolute path so both interpretations
# agree (and the printed revert command works from any CWD).
if [[ "$short" == ./* || "$short" == ../* ]]; then
local compose_dir
compose_dir="$(cd "$(dirname "$COMPOSE_FILE")" && pwd)"
(
cd "$compose_dir"
cd "$(dirname "$short")"
printf '%s/%s\n' "$(pwd)" "$(basename "$short")"
)
return
fi
# Not a named volume (e.g. an absolute bind-mount path) — use it as-is.
echo "$short"
}
backup_sqlite() {
BACKUP_DIR="$(pwd)/backups/sqlite-pre-enterprise-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BACKUP_DIR"
local data_volume_actual
data_volume_actual=$(resolve_data_volume "$DATA_VOLUME")
echo "Backing up SQLite store from volume '$data_volume_actual' to $BACKUP_DIR ..."
docker run --rm \
-v "${data_volume_actual}:/var/lib/netbird:ro" \
-v "${BACKUP_DIR}:/backup" \
busybox \
sh -c 'cp -a /var/lib/netbird/. /backup/ 2>/dev/null || true'
local copied
copied=$(find "$BACKUP_DIR" -mindepth 1 | head -1)
if [[ -z "$copied" ]]; then
echo " ⚠ Backup directory is empty — the volume '$data_volume_actual' didn't contain data. Aborting." > /dev/stderr
exit 1
fi
echo " done"
}
run_migrate_store() {
echo "Running migrate-store (SQLite → Postgres) ..."
$DOCKER_COMPOSE_COMMAND run --rm "$COMBINED_SERVICE" migrate-store --config /etc/netbird/config.yaml.enterprise --verify
echo " done"
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
init_migration() {
DOCKER_COMPOSE_COMMAND=$(check_docker_compose)
check_yq
check_openssl
COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.yml}"
if [[ ! -f "$COMPOSE_FILE" ]]; then
echo "$COMPOSE_FILE not found in $(pwd)." > /dev/stderr
exit 1
fi
if [[ -f "$OVERRIDE_FILE" ]] || [[ -f "$ENTERPRISE_CONFIG_FILE" ]]; then
echo "Migration artifacts already exist in $(pwd):"
[[ -f "$OVERRIDE_FILE" ]] && echo " $OVERRIDE_FILE"
[[ -f "$ENTERPRISE_CONFIG_FILE" ]] && echo " $ENTERPRISE_CONFIG_FILE"
echo ""
echo "Either you've already migrated, or a previous run was interrupted."
echo "To re-run cleanly: rm -f $OVERRIDE_FILE $ENTERPRISE_CONFIG_FILE"
exit 1
fi
COMBINED_SERVICE=$(detect_combined_service)
DASHBOARD_SERVICE=$(detect_dashboard_service)
CONFIG_YAML_HOST=$(detect_config_yaml_host_path)
DATA_VOLUME=$(detect_data_volume)
COMPOSE_NETWORK=$(detect_compose_network)
if [[ -z "$COMBINED_SERVICE" ]]; then
echo "Could not find a service running netbirdio/netbird-server* in $COMPOSE_FILE." > /dev/stderr
echo "This script targets the community combined-server deployment." > /dev/stderr
exit 1
fi
if [[ -z "$DASHBOARD_SERVICE" ]]; then
echo "Could not find a service running netbirdio/dashboard* in $COMPOSE_FILE." > /dev/stderr
exit 1
fi
if [[ -z "$CONFIG_YAML_HOST" ]]; then
echo "Could not find a config.yaml mount on $COMBINED_SERVICE (expected to bind-mount to /etc/netbird/config.yaml)." > /dev/stderr
exit 1
fi
if [[ ! -f "$CONFIG_YAML_HOST" ]]; then
echo "config.yaml host file not found at $CONFIG_YAML_HOST." > /dev/stderr
exit 1
fi
if [[ -z "$DATA_VOLUME" ]]; then
echo "Could not find a volume mounted at /var/lib/netbird on $COMBINED_SERVICE." > /dev/stderr
exit 1
fi
echo "Detected existing deployment:"
echo " Combined service: $COMBINED_SERVICE"
echo " Dashboard: $DASHBOARD_SERVICE"
echo " config.yaml: $CONFIG_YAML_HOST"
echo " Data volume: $DATA_VOLUME"
echo " Network: $COMPOSE_NETWORK"
echo ""
local proceed
proceed=$(read_yes_no "Proceed with migration?" "y")
if [[ "$proceed" != "yes" ]]; then
echo "Aborted."
exit 0
fi
# Step 1 — always (this is the point of the script)
MIGRATE_IMAGES="yes"
echo ""
echo "Step 1: Image swap (community → Enterprise). License key required."
NB_LICENSE_KEY=$(read_secret " License key")
GHCR_USERNAME="netbirdExtAccess1"
GHCR_TOKEN=$(read_secret " GHCR token (input hidden)")
# Step 2 — optional
echo ""
MIGRATE_POSTGRES=$(read_yes_no "Step 2: Migrate storage from SQLite to Postgres? (recommended)" "n")
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
echo ""
echo " ⚠ Data will be migrated from SQLite to Postgres. The SQLite store"
echo " will be backed up automatically. To fully revert later, restore"
echo " that backup and delete docker-compose.override.yml +"
echo " config.yaml.enterprise."
local confirm
confirm=$(read_yes_no " Continue?" "y")
if [[ "$confirm" != "yes" ]]; then
MIGRATE_POSTGRES="no"
echo " Skipping Postgres migration."
else
POSTGRES_PASSWORD=$(rand_password)
fi
fi
# Step 3 — optional, only if Postgres is on (flow requires Postgres)
echo ""
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
ENABLE_FLOW=$(read_yes_no "Step 3: Enable traffic flow? (requires Postgres)" "n")
if [[ "$ENABLE_FLOW" == "yes" ]]; then
# Auth secret MUST match server.authSecret from config.yaml
NB_FLOW_AUTH_SECRET=$(yq eval '.server.authSecret // ""' "$CONFIG_YAML_HOST")
if [[ -z "$NB_FLOW_AUTH_SECRET" ]] || [[ "$NB_FLOW_AUTH_SECRET" == "null" ]]; then
echo "Could not read server.authSecret from $CONFIG_YAML_HOST." > /dev/stderr
echo "Flow receiver auth must match the combined server's authSecret." > /dev/stderr
exit 1
fi
NETBIRD_DOMAIN=$(detect_exposed_address)
if [[ -z "$NETBIRD_DOMAIN" ]] || [[ "$NETBIRD_DOMAIN" == "null" ]]; then
NETBIRD_DOMAIN=$(read_required " Public NetBird URL (e.g. https://netbird.example.com)")
fi
# Strip protocol + port to leave just the hostname for the Traefik Host() rule.
NETBIRD_HOSTNAME=$(echo "$NETBIRD_DOMAIN" | sed -E 's,^https?://,,' | sed 's,:.*,,' | sed 's,/.*,,')
# We need the encryption key from the existing config.yaml for the enricher
NETBIRD_ENCRYPTION_KEY=$(yq eval '.server.store.encryptionKey // ""' "$CONFIG_YAML_HOST")
if [[ -z "$NETBIRD_ENCRYPTION_KEY" ]] || [[ "$NETBIRD_ENCRYPTION_KEY" == "null" ]]; then
echo "Could not read server.store.encryptionKey from $CONFIG_YAML_HOST." > /dev/stderr
exit 1
fi
fi
else
ENABLE_FLOW="no"
echo "Step 3 (traffic flow) skipped — requires Postgres."
fi
}
apply_changes() {
echo ""
echo "Writing $OVERRIDE_FILE ..."
install -m 644 /dev/null "$OVERRIDE_FILE"
render_override > "$OVERRIDE_FILE"
if [[ -z "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
sed -i.bak '/NETBIRD_LICENSE_SERVER_BASE_URL/d' "$OVERRIDE_FILE" && rm -f "$OVERRIDE_FILE.bak"
fi
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
echo "Writing $ENTERPRISE_CONFIG_FILE ..."
install -m 600 /dev/null "$ENTERPRISE_CONFIG_FILE"
render_enterprise_config
fi
# Persist secrets that the override file references via env interpolation.
# We write them to a .env file in the current directory; docker compose
# picks it up automatically.
echo "Writing .env additions (mode 600) ..."
local ENV_FILE=".env"
touch "$ENV_FILE"
chmod 600 "$ENV_FILE"
{
echo ""
echo "# Added by migrate-to-enterprise.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "NB_LICENSE_KEY=${NB_LICENSE_KEY}"
if [[ -n "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
echo "NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}"
fi
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
echo "POSTGRES_PASSWORD=${POSTGRES_PASSWORD}"
fi
if [[ "$ENABLE_FLOW" == "yes" ]]; then
echo "NB_FLOW_AUTH_SECRET=${NB_FLOW_AUTH_SECRET}"
echo "NETBIRD_ENCRYPTION_KEY=${NETBIRD_ENCRYPTION_KEY}"
fi
} >> "$ENV_FILE"
echo ""
echo "Logging in to ghcr.io ..."
printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
unset GHCR_TOKEN
echo ""
echo "Pulling enterprise images ..."
$DOCKER_COMPOSE_COMMAND pull
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
echo ""
echo "Stopping existing services (volumes preserved) ..."
$DOCKER_COMPOSE_COMMAND down
backup_sqlite
echo ""
echo "Starting Postgres ..."
$DOCKER_COMPOSE_COMMAND up -d postgres
# Wait for healthy
local counter=0
echo -n "Waiting for Postgres to become ready"
while ! $DOCKER_COMPOSE_COMMAND exec -T postgres pg_isready -U netbird -d netbird &> /dev/null; do
echo -n " ."
sleep 2
counter=$((counter + 1))
if [[ $counter -ge 60 ]]; then
echo ""
echo "Postgres did not become ready in 120s. Recent logs:"
$DOCKER_COMPOSE_COMMAND logs --tail=20 postgres
exit 1
fi
done
echo " done"
run_migrate_store
fi
echo ""
echo "Bringing up all services ..."
$DOCKER_COMPOSE_COMMAND up -d
echo ""
echo "Migration complete."
}
print_summary() {
echo ""
echo "──────────────────────────────────────────────────────────────────────"
echo " Summary"
echo "──────────────────────────────────────────────────────────────────────"
echo " Images: swapped to enterprise"
[[ "$MIGRATE_POSTGRES" == "yes" ]] && echo " Storage: Postgres (data migrated from SQLite)"
[[ "$MIGRATE_POSTGRES" != "yes" ]] && echo " Storage: SQLite (unchanged)"
[[ "$ENABLE_FLOW" == "yes" ]] && echo " Traffic flow: enabled"
[[ "$ENABLE_FLOW" != "yes" ]] && echo " Traffic flow: disabled"
echo ""
echo " Generated files (next to your docker-compose.yml):"
echo " $OVERRIDE_FILE"
[[ "$MIGRATE_POSTGRES" == "yes" ]] && echo " $ENTERPRISE_CONFIG_FILE"
echo " .env (license key + secrets, mode 600)"
[[ "$MIGRATE_POSTGRES" == "yes" ]] && echo " backups/sqlite-pre-enterprise-*/ (SQLite backup)"
echo ""
echo " Tail logs:"
echo " $DOCKER_COMPOSE_COMMAND logs -f $COMBINED_SERVICE"
echo ""
echo "──────────────────────────────────────────────────────────────────────"
echo " To revert"
echo "──────────────────────────────────────────────────────────────────────"
echo " $DOCKER_COMPOSE_COMMAND down"
if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
# Resolve project-prefixed volume names now (before override is removed).
local pg_volume data_volume_actual
pg_volume=$(resolve_data_volume "netbird_postgres")
data_volume_actual=$(resolve_data_volume "$DATA_VOLUME")
echo " # Remove the Postgres volume FIRST, before deleting the override file:"
echo " docker volume rm $pg_volume"
echo " # Restore SQLite from the backup created during this run:"
echo " docker run --rm -v ${data_volume_actual}:/var/lib/netbird -v ${BACKUP_DIR}:/backup busybox sh -c 'cp -a /backup/. /var/lib/netbird/'"
fi
echo " rm -f $OVERRIDE_FILE $ENTERPRISE_CONFIG_FILE"
echo " # Remove migrate-to-enterprise.sh additions from .env (search for the timestamp marker)"
echo " $DOCKER_COMPOSE_COMMAND up -d"
echo "──────────────────────────────────────────────────────────────────────"
}
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
init_migration
apply_changes
print_summary

View File

@@ -2,5 +2,4 @@ FROM ubuntu:24.04
RUN apt update && apt install -y ca-certificates && rm -fr /var/cache/apt
ENTRYPOINT [ "/go/bin/netbird-mgmt","management"]
CMD ["--log-file", "console"]
ARG TARGETPLATFORM
COPY ${TARGETPLATFORM}/netbird-mgmt /go/bin/netbird-mgmt
COPY netbird-mgmt /go/bin/netbird-mgmt

View File

@@ -0,0 +1,5 @@
FROM ubuntu:24.04
RUN apt update && apt install -y ca-certificates && rm -fr /var/cache/apt
ENTRYPOINT [ "/go/bin/netbird-mgmt","management","--log-level","debug"]
CMD ["--log-file", "console"]
COPY netbird-mgmt /go/bin/netbird-mgmt

View File

@@ -45,7 +45,7 @@ type Controller struct {
EphemeralPeersManager ephemeral.Manager
accountUpdateLocks sync.Map
affectedPeerUpdateLocks sync.Map
sendAccountUpdateLocks sync.Map
updateAccountPeersBufferInterval atomic.Int64
// dnsDomain is used for peer resolution. This is appended to the peer's name
dnsDomain string
@@ -64,13 +64,6 @@ type bufferUpdate struct {
update atomic.Bool
}
type bufferAffectedUpdate struct {
sendMu sync.Mutex
dataMu sync.Mutex
next *time.Timer
peerIDs map[string]struct{}
}
var _ network_map.Controller = (*Controller)(nil)
func NewController(ctx context.Context, store store.Store, metrics telemetry.AppMetrics, peersUpdateManager network_map.PeersUpdateManager, requestBuffer account.RequestBuffer, integratedPeerValidator integrated_validator.IntegratedValidator, settingsManager settings.Manager, dnsDomain string, proxyController port_forwarding.Controller, ephemeralPeersManager ephemeral.Manager, config *config.Config) *Controller {
@@ -116,24 +109,6 @@ func (c *Controller) OnPeerDisconnected(ctx context.Context, accountID string, p
c.EphemeralPeersManager.OnPeerDisconnected(ctx, peer)
}
// injectAllProxyPolicies prepares an account for the per-peer network-map
// computation. It prepends the in-memory agent-network services synthesised
// from the account's current provider/policy state to account.Services so
// the existing InjectProxyPolicies + injectPrivateServicePolicies walks pick
// them up alongside persisted reverse-proxy services. Synthesised services
// are never persisted; the account is loaded fresh per cycle so re-prepending
// is safe and idempotent. Accounts without agent-network providers get an
// empty synth slice — no behaviour change.
func (c *Controller) injectAllProxyPolicies(ctx context.Context, account *types.Account) {
synth, err := c.repo.SynthesizeAgentNetworkServices(ctx, account.Id)
if err != nil {
log.WithContext(ctx).Warnf("synthesise agent-network services for account %s: %v", account.Id, err)
} else if len(synth) > 0 {
account.Services = append(synth, account.Services...)
}
account.InjectProxyPolicies(ctx)
}
func (c *Controller) CountStreams() int {
return c.peersUpdateManager.CountStreams()
}
@@ -168,7 +143,7 @@ func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID strin
var wg sync.WaitGroup
semaphore := make(chan struct{}, 10)
c.injectAllProxyPolicies(ctx, account)
account.InjectProxyPolicies(ctx)
dnsCache := &cache.DNSConfigCache{}
dnsDomain := c.GetDNSDomain(account.Settings)
peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
@@ -226,7 +201,7 @@ func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID strin
c.metrics.CountCalcPeerNetworkMapDuration(time.Since(start))
proxyNetworkMap, ok := proxyNetworkMaps[p.ID]
proxyNetworkMap, ok := proxyNetworkMaps[peer.ID]
if ok {
remotePeerNetworkMap.Merge(proxyNetworkMap)
}
@@ -251,6 +226,44 @@ func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID strin
return nil
}
func (c *Controller) bufferSendUpdateAccountPeers(ctx context.Context, accountID string, reason types.UpdateReason) error {
log.WithContext(ctx).Tracef("buffer sending update peers for account %s from %s", accountID, util.GetCallerName())
if c.accountManagerMetrics != nil {
c.accountManagerMetrics.CountUpdateAccountPeersTriggered(string(reason.Resource), string(reason.Operation))
}
bufUpd, _ := c.sendAccountUpdateLocks.LoadOrStore(accountID, &bufferUpdate{})
b := bufUpd.(*bufferUpdate)
if !b.mu.TryLock() {
b.update.Store(true)
return nil
}
if b.next != nil {
b.next.Stop()
}
go func() {
defer b.mu.Unlock()
_ = c.sendUpdateAccountPeers(ctx, accountID, reason)
if !b.update.Load() {
return
}
b.update.Store(false)
if b.next == nil {
b.next = time.AfterFunc(time.Duration(c.updateAccountPeersBufferInterval.Load()), func() {
_ = c.sendUpdateAccountPeers(ctx, accountID, reason)
})
return
}
b.next.Reset(time.Duration(c.updateAccountPeersBufferInterval.Load()))
}()
return nil
}
// UpdatePeers updates all peers that belong to an account.
// Should be called when changes have to be synced to peers.
func (c *Controller) UpdateAccountPeers(ctx context.Context, accountID string, reason types.UpdateReason) error {
@@ -260,151 +273,6 @@ func (c *Controller) UpdateAccountPeers(ctx context.Context, accountID string, r
return c.sendUpdateAccountPeers(ctx, accountID, reason)
}
// UpdateAffectedPeers updates only the specified peers that belong to an account.
func (c *Controller) UpdateAffectedPeers(ctx context.Context, accountID string, peerIDs []string) error {
if len(peerIDs) == 0 {
return nil
}
return c.sendUpdateForAffectedPeers(ctx, accountID, peerIDs)
}
func (c *Controller) sendUpdateForAffectedPeers(ctx context.Context, accountID string, peerIDs []string) error {
log.WithContext(ctx).Tracef("sendUpdateForAffectedPeers: account %s, %d affected peers: %v (caller: %s)", accountID, len(peerIDs), peerIDs, util.GetCallerName())
if !c.hasConnectedPeers(peerIDs) {
log.WithContext(ctx).Tracef("sendUpdateForAffectedPeers: no connected peers among %v, skipping", peerIDs)
return nil
}
account, err := c.requestBuffer.GetAccountWithBackpressure(ctx, accountID)
if err != nil {
return fmt.Errorf("failed to get account: %v", err)
}
globalStart := time.Now()
peersToUpdate := c.filterConnectedAffectedPeers(account, peerIDs)
if len(peersToUpdate) == 0 {
log.WithContext(ctx).Tracef("sendUpdateForAffectedPeers: no peers to update (affected peers not found in account or no channels)")
return nil
}
log.WithContext(ctx).Tracef("sendUpdateForAffectedPeers: sending network map to %d connected peers", len(peersToUpdate))
approvedPeersMap, err := c.integratedPeerValidator.GetValidatedPeers(ctx, account.Id, maps.Values(account.Groups), maps.Values(account.Peers), account.Settings.Extra)
if err != nil {
return fmt.Errorf("failed to get validate peers: %v", err)
}
var wg sync.WaitGroup
semaphore := make(chan struct{}, 10)
// The affected-peer path MUST mirror sendUpdateAccountPeers (line 171)
// here: injectAllProxyPolicies prepends the synthesised agent-network
// services BEFORE InjectProxyPolicies + private-service policies run.
// Previously this path called only account.InjectProxyPolicies, which
// skipped the synth-services prepend — so peer-level changes
// (proxy restart, embedded peer connect/disconnect) propagated a
// network map that omitted the synth DNS zone, and the agent kept
// resolving against the stale or absent record.
c.injectAllProxyPolicies(ctx, account)
dnsCache := &cache.DNSConfigCache{}
dnsDomain := c.GetDNSDomain(account.Settings)
peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
proxyNetworkMaps, err := c.proxyController.GetProxyNetworkMapsAll(ctx, accountID, account.Peers)
if err != nil {
log.WithContext(ctx).Errorf("failed to get proxy network maps: %v", err)
return fmt.Errorf("failed to get proxy network maps: %v", err)
}
extraSetting, err := c.settingsManager.GetExtraSettings(ctx, accountID)
if err != nil {
return fmt.Errorf("failed to get flow enabled status: %v", err)
}
dnsFwdPort := computeForwarderPort(maps.Values(account.Peers), network_map.DnsForwarderPortMinVersion)
accountZones, err := c.repo.GetAccountZones(ctx, account.Id)
if err != nil {
log.WithContext(ctx).Errorf("failed to get account zones: %v", err)
return fmt.Errorf("failed to get account zones: %v", err)
}
for _, peer := range peersToUpdate {
wg.Add(1)
semaphore <- struct{}{}
go func(p *nbpeer.Peer) {
defer wg.Done()
defer func() { <-semaphore }()
start := time.Now()
postureChecks, err := c.getPeerPostureChecks(account, p.ID)
if err != nil {
log.WithContext(ctx).Debugf("failed to get posture checks for peer %s: %v", p.ID, err)
return
}
c.metrics.CountCalcPostureChecksDuration(time.Since(start))
start = time.Now()
remotePeerNetworkMap := account.GetPeerNetworkMapFromComponents(ctx, p.ID, peersCustomZone, accountZones, approvedPeersMap, resourcePolicies, routers, c.accountManagerMetrics, groupIDToUserIDs)
c.metrics.CountCalcPeerNetworkMapDuration(time.Since(start))
proxyNetworkMap, ok := proxyNetworkMaps[p.ID]
if ok {
remotePeerNetworkMap.Merge(proxyNetworkMap)
}
peerGroups := account.GetPeerGroups(p.ID)
start = time.Now()
update := grpc.ToSyncResponse(ctx, nil, c.config.HttpConfig, c.config.DeviceAuthorizationFlow, p, nil, nil, remotePeerNetworkMap, dnsDomain, postureChecks, dnsCache, account.Settings, extraSetting, maps.Keys(peerGroups), dnsFwdPort)
c.metrics.CountToSyncResponseDuration(time.Since(start))
c.peersUpdateManager.SendUpdate(ctx, p.ID, &network_map.UpdateMessage{
Update: update,
MessageType: network_map.MessageTypeNetworkMap,
})
}(peer)
}
wg.Wait()
if c.accountManagerMetrics != nil {
c.accountManagerMetrics.CountUpdateAccountPeersDuration(time.Since(globalStart))
}
return nil
}
func (c *Controller) hasConnectedPeers(peerIDs []string) bool {
for _, id := range peerIDs {
if c.peersUpdateManager.HasChannel(id) {
return true
}
}
return false
}
func (c *Controller) filterConnectedAffectedPeers(account *types.Account, peerIDs []string) []*nbpeer.Peer {
affected := make(map[string]struct{}, len(peerIDs))
for _, id := range peerIDs {
affected[id] = struct{}{}
}
var result []*nbpeer.Peer
for _, peer := range account.Peers {
if _, ok := affected[peer.ID]; ok && c.peersUpdateManager.HasChannel(peer.ID) {
result = append(result, peer)
}
}
return result
}
func (c *Controller) UpdateAccountPeer(ctx context.Context, accountId string, peerId string) error {
if !c.peersUpdateManager.HasChannel(peerId) {
return fmt.Errorf("peer %s doesn't have a channel, skipping network map update", peerId)
@@ -425,7 +293,7 @@ func (c *Controller) UpdateAccountPeer(ctx context.Context, accountId string, pe
return fmt.Errorf("failed to get validated peers: %v", err)
}
c.injectAllProxyPolicies(ctx, account)
account.InjectProxyPolicies(ctx)
dnsCache := &cache.DNSConfigCache{}
dnsDomain := c.GetDNSDomain(account.Settings)
peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
@@ -513,162 +381,66 @@ func (c *Controller) BufferUpdateAccountPeers(ctx context.Context, accountID str
return nil
}
// BufferUpdateAffectedPeers accumulates peer IDs and flushes them after the buffer interval.
func (c *Controller) BufferUpdateAffectedPeers(ctx context.Context, accountID string, peerIDs []string, reason types.UpdateReason) error {
if len(peerIDs) == 0 {
return nil
}
if c.accountManagerMetrics != nil {
c.accountManagerMetrics.CountUpdateAccountPeersTriggered(string(reason.Resource), string(reason.Operation))
}
log.WithContext(ctx).Tracef("buffer updating %d affected peers for account %s from %s with reason %s/%s", len(peerIDs), accountID, util.GetCallerName(), reason.Operation, reason.Resource)
bufUpd, _ := c.affectedPeerUpdateLocks.LoadOrStore(accountID, &bufferAffectedUpdate{
peerIDs: make(map[string]struct{}),
})
b := bufUpd.(*bufferAffectedUpdate)
b.addPeerIDs(peerIDs)
if !b.sendMu.TryLock() {
// Another goroutine is already sending; it will pick up our IDs on its next drain.
return nil
}
b.stopTimer()
// The send and the debounced timer outlive the calling request, so detach from
// its context to avoid sending with a cancelled context once the handler returns.
bgCtx := context.WithoutCancel(ctx)
collected := b.drainPeerIDs()
go func() {
defer b.sendMu.Unlock()
_ = c.sendUpdateForAffectedPeers(bgCtx, accountID, collected)
// Check if more peer IDs accumulated while we were sending.
if !b.hasPending() {
return
}
// Schedule a debounced flush for the newly accumulated IDs.
b.setTimer(time.Duration(c.updateAccountPeersBufferInterval.Load()), func() {
ids := b.drainPeerIDs()
if len(ids) > 0 {
_ = c.sendUpdateForAffectedPeers(bgCtx, accountID, ids)
}
})
}()
return nil
}
func (b *bufferAffectedUpdate) addPeerIDs(ids []string) {
b.dataMu.Lock()
for _, id := range ids {
b.peerIDs[id] = struct{}{}
}
b.dataMu.Unlock()
}
func (b *bufferAffectedUpdate) drainPeerIDs() []string {
b.dataMu.Lock()
defer b.dataMu.Unlock()
if len(b.peerIDs) == 0 {
return nil
}
ids := make([]string, 0, len(b.peerIDs))
for id := range b.peerIDs {
ids = append(ids, id)
}
b.peerIDs = make(map[string]struct{})
return ids
}
func (b *bufferAffectedUpdate) hasPending() bool {
b.dataMu.Lock()
defer b.dataMu.Unlock()
return len(b.peerIDs) > 0
}
func (b *bufferAffectedUpdate) stopTimer() {
b.dataMu.Lock()
defer b.dataMu.Unlock()
if b.next != nil {
b.next.Stop()
}
}
func (b *bufferAffectedUpdate) setTimer(d time.Duration, f func()) {
b.dataMu.Lock()
defer b.dataMu.Unlock()
if b.next == nil {
b.next = time.AfterFunc(d, f)
return
}
b.next.Reset(d)
}
func (c *Controller) GetValidatedPeerWithMap(ctx context.Context, isRequiresApproval bool, accountID string, peerID string) (*types.NetworkMap, []*posture.Checks, int64, error) {
func (c *Controller) GetValidatedPeerWithMap(ctx context.Context, isRequiresApproval bool, accountID string, peer *nbpeer.Peer) (*nbpeer.Peer, *types.NetworkMap, []*posture.Checks, int64, error) {
if isRequiresApproval {
network, err := c.repo.GetAccountNetwork(ctx, accountID)
if err != nil {
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
emptyMap := &types.NetworkMap{
Network: network.Copy(),
}
return emptyMap, nil, 0, nil
return peer, emptyMap, nil, 0, nil
}
account, err := c.requestBuffer.GetAccountWithBackpressure(ctx, accountID)
if err != nil {
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
c.injectAllProxyPolicies(ctx, account)
account.InjectProxyPolicies(ctx)
approvedPeersMap, err := c.integratedPeerValidator.GetValidatedPeers(ctx, account.Id, maps.Values(account.Groups), maps.Values(account.Peers), account.Settings.Extra)
if err != nil {
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
postureChecks, err := c.getPeerPostureChecks(account, peerID)
startPosture := time.Now()
postureChecks, err := c.getPeerPostureChecks(account, peer.ID)
if err != nil {
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
log.WithContext(ctx).Debugf("getPeerPostureChecks took %s", time.Since(startPosture))
accountZones, err := c.repo.GetAccountZones(ctx, account.Id)
if err != nil {
log.WithContext(ctx).Errorf("failed to get account zones: %v", err)
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
dnsDomain := c.GetDNSDomain(account.Settings)
peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
proxyNetworkMaps, err := c.proxyController.GetProxyNetworkMaps(ctx, account.Id, peerID, account.Peers)
proxyNetworkMaps, err := c.proxyController.GetProxyNetworkMaps(ctx, account.Id, peer.ID, account.Peers)
if err != nil {
log.WithContext(ctx).Errorf("failed to get proxy network maps: %v", err)
return nil, nil, 0, err
return nil, nil, nil, 0, err
}
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()
networkMap := account.GetPeerNetworkMapFromComponents(ctx, peerID, peersCustomZone, accountZones, approvedPeersMap, resourcePolicies, routers, c.accountManagerMetrics, groupIDToUserIDs)
networkMap := account.GetPeerNetworkMapFromComponents(ctx, peer.ID, peersCustomZone, accountZones, approvedPeersMap, resourcePolicies, routers, c.accountManagerMetrics, groupIDToUserIDs)
proxyNetworkMap, ok := proxyNetworkMaps[peerID]
proxyNetworkMap, ok := proxyNetworkMaps[peer.ID]
if ok {
networkMap.Merge(proxyNetworkMap)
}
dnsFwdPort := computeForwarderPort(maps.Values(account.Peers), network_map.DnsForwarderPortMinVersion)
return networkMap, postureChecks, dnsFwdPort, nil
return peer, networkMap, postureChecks, dnsFwdPort, nil
}
// GetDNSDomain returns the configured dnsDomain
@@ -806,24 +578,21 @@ func isPeerInPolicySourceGroups(account *types.Account, peerID string, policy *t
return false, nil
}
func (c *Controller) OnPeersUpdated(ctx context.Context, accountID string, peerIDs []string, affectedPeerIDs []string) error {
if len(affectedPeerIDs) == 0 {
log.WithContext(ctx).Tracef("no affected peers for peer update in account %s, skipping", accountID)
return nil
func (c *Controller) OnPeersUpdated(ctx context.Context, accountID string, peerIDs []string) error {
err := c.bufferSendUpdateAccountPeers(ctx, accountID, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationUpdate})
if err != nil {
log.WithContext(ctx).Errorf("failed to buffer update account peers for peer update in account %s: %v", accountID, err)
}
return c.BufferUpdateAffectedPeers(ctx, accountID, affectedPeerIDs, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationUpdate})
return nil
}
func (c *Controller) OnPeersAdded(ctx context.Context, accountID string, peerIDs []string, affectedPeerIDs []string) error {
func (c *Controller) OnPeersAdded(ctx context.Context, accountID string, peerIDs []string) error {
log.WithContext(ctx).Debugf("OnPeersAdded call to add peers: %v", peerIDs)
if len(affectedPeerIDs) == 0 {
log.WithContext(ctx).Tracef("no affected peers for peer add in account %s, skipping", accountID)
return nil
}
return c.BufferUpdateAffectedPeers(ctx, accountID, affectedPeerIDs, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationCreate})
return c.bufferSendUpdateAccountPeers(ctx, accountID, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationCreate})
}
func (c *Controller) OnPeersDeleted(ctx context.Context, accountID string, peerIDs []string, affectedPeerIDs []string) error {
func (c *Controller) OnPeersDeleted(ctx context.Context, accountID string, peerIDs []string) error {
network, err := c.repo.GetAccountNetwork(ctx, accountID)
if err != nil {
return err
@@ -856,11 +625,7 @@ func (c *Controller) OnPeersDeleted(ctx context.Context, accountID string, peerI
c.peersUpdateManager.CloseChannel(ctx, peerID)
}
if len(affectedPeerIDs) == 0 {
log.WithContext(ctx).Tracef("no affected peers for peer delete in account %s, skipping", accountID)
return nil
}
return c.BufferUpdateAffectedPeers(ctx, accountID, affectedPeerIDs, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationDelete})
return c.bufferSendUpdateAccountPeers(ctx, accountID, types.UpdateReason{Resource: types.UpdateResourcePeer, Operation: types.UpdateOperationDelete})
}
// GetNetworkMap returns Network map for a given peer (omits original peer from the Peers result)
@@ -900,7 +665,7 @@ func (c *Controller) GetNetworkMap(ctx context.Context, peerID string) (*types.N
return nil, err
}
c.injectAllProxyPolicies(ctx, account)
account.InjectProxyPolicies(ctx)
resourcePolicies := account.GetResourcePoliciesMap()
routers := account.GetResourceRoutersMap()
groupIDToUserIDs := account.GetActiveGroupUsers()

Some files were not shown because too many files have changed in this diff Show More