Anpassung von Prometheus, Grafana und Backend auf Anomalieerkennung.
All checks were successful
release-tag / release-image (push) Successful in 2m20s

This commit is contained in:
2026-04-24 21:38:25 +02:00
parent cdee259fb1
commit 3447af8d44
5 changed files with 1502 additions and 32 deletions

View File

@@ -3,74 +3,743 @@
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"liveNow": false,
"panels": [
{
"type": "stat",
"title": "Active Agents",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"targets": [
{
"expr": "eventcollector_active_agents",
"refId": "A"
}
]
],
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"orientation": "auto",
"textMode": "auto",
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto"
}
},
{
"type": "stat",
"title": "High Detections (5m)",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"title": "Events/s",
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"targets": [
{
"expr": "increase(eventcollector_detection_hits_total{severity=\"high\"}[5m])",
"expr": "sum(rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m]))",
"refId": "A"
}
]
],
"fieldConfig": {
"defaults": {
"unit": "eps",
"decimals": 2
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"textMode": "auto"
}
},
{
"type": "stat",
"title": "High Detections 5m",
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"targets": [
{
"expr": "sum(increase(eventcollector_detection_hits_total{severity=\"high\",rule=~\"$rule\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"textMode": "auto"
}
},
{
"type": "stat",
"title": "Baseline Max Z-Score",
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"targets": [
{
"expr": "max(eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"decimals": 2,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 3 },
{ "color": "red", "value": 5 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"textMode": "auto"
}
},
{
"type": "stat",
"title": "Rule Errors 5m",
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"targets": [
{
"expr": "sum(increase(eventcollector_rule_errors_total{rule=~\"$rule\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"textMode": "auto"
}
},
{
"type": "stat",
"title": "DB Insert Failures 5m",
"datasource": "$datasource",
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"targets": [
{
"expr": "increase(eventcollector_db_insert_failures_total[5m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"textMode": "auto"
}
},
{
"type": "timeseries",
"title": "HTTP Requests",
"title": "Ingested Events / Second by Channel",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"targets": [
{
"expr": "rate(eventcollector_http_requests_total[5m])",
"legendFormat": "{{path}} {{status}}",
"expr": "sum by (channel) (rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m]))",
"legendFormat": "{{channel}}",
"refId": "A"
}
]
],
"fieldConfig": {
"defaults": {
"unit": "eps",
"decimals": 2
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "Detection Hits",
"title": "Detection Hits by Rule / Severity",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"expr": "increase(eventcollector_detection_hits_total[5m])",
"legendFormat": "{{rule}} {{severity}}",
"expr": "sum by (rule,severity) (increase(eventcollector_detection_hits_total{rule=~\"$rule\",severity=~\"$severity\"}[5m]))",
"legendFormat": "{{rule}} / {{severity}}",
"refId": "A"
}
]
],
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "Baseline: Current Count vs Average",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"targets": [
{
"expr": "eventcollector_baseline_current_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}",
"legendFormat": "current {{host}} {{channel}} {{event_id}}",
"refId": "A"
},
{
"expr": "eventcollector_baseline_avg_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}",
"legendFormat": "avg {{host}} {{channel}} {{event_id}}",
"refId": "B"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 2
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "Ingested Events",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
"title": "Baseline Z-Score",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"targets": [
{
"expr": "rate(eventcollector_ingest_events_total[5m])",
"legendFormat": "{{channel}} {{event_id}}",
"expr": "eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"}",
"legendFormat": "{{host}}",
"refId": "A"
}
]
],
"fieldConfig": {
"defaults": {
"decimals": 2,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 3 },
{ "color": "red", "value": 5 }
]
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "bargauge",
"title": "Top Baseline Z-Scores",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 20 },
"targets": [
{
"expr": "topk(10, eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"})",
"legendFormat": "{{host}}",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"decimals": 2,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 3 },
{ "color": "red", "value": 5 }
]
}
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showUnfilled": true
}
},
{
"type": "bargauge",
"title": "Top EventIDs by Ingest Rate",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 20 },
"targets": [
{
"expr": "topk(15, sum by (channel,event_id) (rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m])))",
"legendFormat": "{{channel}} / {{event_id}}",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "eps",
"decimals": 2
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showUnfilled": true
}
},
{
"type": "bargauge",
"title": "Top Detection Rules 1h",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 },
"targets": [
{
"expr": "topk(15, sum by (rule,severity) (increase(eventcollector_detection_hits_total{rule=~\"$rule\",severity=~\"$severity\"}[1h])))",
"legendFormat": "{{rule}} / {{severity}}",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showUnfilled": true
}
},
{
"type": "timeseries",
"title": "HTTP Requests by Path / Status",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 },
"targets": [
{
"expr": "sum by (path,status) (rate(eventcollector_http_requests_total[5m]))",
"legendFormat": "{{path}} {{status}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"decimals": 2
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "HTTP Latency p95",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 },
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le,path) (rate(eventcollector_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "{{path}} p95",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"decimals": 3
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "DB Insert Transaction Latency p95",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 },
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(eventcollector_db_tx_duration_seconds_bucket[5m])))",
"legendFormat": "db tx p95",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"decimals": 3
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "timeseries",
"title": "DB Batch Size p95",
"datasource": "$datasource",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 },
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(eventcollector_db_batch_size_bucket[5m])))",
"legendFormat": "batch size p95",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
}
},
{
"type": "table",
"title": "Agent Last Seen",
"datasource": "$datasource",
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 44 },
"targets": [
{
"expr": "time() - eventcollector_agent_last_seen_unixtime{host=~\"$host\"}",
"legendFormat": "{{host}}",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"decimals": 0
},
"overrides": []
},
"options": {
"showHeader": true
}
},
{
"type": "table",
"title": "Baseline Samples",
"datasource": "$datasource",
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 44 },
"targets": [
{
"expr": "eventcollector_baseline_sample_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}",
"legendFormat": "{{host}} {{channel}} {{event_id}}",
"refId": "A",
"instant": true,
"format": "table"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0
},
"overrides": []
},
"options": {
"showHeader": true
}
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["siem"],
"templating": { "list": [] },
"tags": ["siem", "baseline", "ad"],
"templating": {
"list": [
{
"name": "datasource",
"type": "datasource",
"query": "prometheus",
"current": {},
"hide": 0,
"label": "Datasource"
},
{
"name": "host",
"type": "query",
"datasource": "$datasource",
"query": "label_values(eventcollector_agent_last_seen_unixtime, host)",
"refresh": 1,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"label": "Host"
},
{
"name": "channel",
"type": "query",
"datasource": "$datasource",
"query": "label_values(eventcollector_ingest_events_total, channel)",
"refresh": 1,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"label": "Channel"
},
{
"name": "event_id",
"type": "query",
"datasource": "$datasource",
"query": "label_values(eventcollector_ingest_events_total, event_id)",
"refresh": 1,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"label": "Event ID"
},
{
"name": "rule",
"type": "query",
"datasource": "$datasource",
"query": "label_values(eventcollector_detection_hits_total, rule)",
"refresh": 1,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"label": "Rule"
},
{
"name": "severity",
"type": "custom",
"query": "low,medium,high",
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"label": "Severity"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"title": "SIEM Overview",
"timezone": "browser",
"title": "SIEM Overview Extended",
"uid": "siem-overview-extended",
"version": 1
}

View File

@@ -89,14 +89,14 @@ CREATE TABLE IF NOT EXISTS detections (
CREATE TABLE detection_rules (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(128) NOT NULL UNIQUE,
name VARCHAR(255) NOT NULL UNIQUE,
description TEXT,
severity VARCHAR(16) NOT NULL DEFAULT 'medium',
channel VARCHAR(64) NOT NULL DEFAULT 'Security',
channel VARCHAR(255) NOT NULL DEFAULT 'Security',
event_ids VARCHAR(255) NOT NULL,
match_field VARCHAR(64) DEFAULT '',
match_field VARCHAR(255) DEFAULT '',
match_operator VARCHAR(16) DEFAULT '',
match_value TEXT,
@@ -1312,4 +1312,42 @@ ALTER TABLE detection_rules
MODIFY description TEXT NULL,
MODIFY match_value TEXT NULL,
MODIFY match_field VARCHAR(64) NOT NULL DEFAULT '',
MODIFY match_operator VARCHAR(16) NOT NULL DEFAULT '';
MODIFY match_operator VARCHAR(16) NOT NULL DEFAULT '';
-- Running per-slot event-count statistics used by the baseline anomaly rule.
-- One row exists per (hostname, channel_name, event_id, hour_of_day,
-- day_of_week); the backend updates it incrementally for each observed window.
CREATE TABLE baseline_event_stats (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
hostname VARCHAR(255) NOT NULL,
channel_name VARCHAR(255) NOT NULL,
event_id INT NOT NULL,
-- Slot keys: the backend fills these from HOUR(ts) and WEEKDAY(ts).
hour_of_day TINYINT NOT NULL,
day_of_week TINYINT NOT NULL,
-- Running mean of the per-window event count.
avg_count DOUBLE NOT NULL DEFAULT 0,
-- Running sum of squared deviations from the mean (Welford accumulator).
m2_count DOUBLE NOT NULL DEFAULT 0,
-- Sample standard deviation, stored as sqrt(m2_count / (sample_count - 1)).
stddev_count DOUBLE NOT NULL DEFAULT 0,
-- Number of windows folded into the statistics so far.
sample_count INT NOT NULL DEFAULT 0,
last_updated TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP(6),
-- Guarantees a single statistics row per slot.
UNIQUE KEY uniq_baseline_event (
hostname,
channel_name,
event_id,
hour_of_day,
day_of_week
)
);
-- Lookup index for baseline slots, extended by sample_count.
-- NOTE(review): the first five columns are identical to the unique key
-- uniq_baseline_event, which already serves equality lookups on a slot;
-- confirm that this additional index is actually needed.
CREATE INDEX idx_baseline_event_lookup
ON baseline_event_stats (
hostname,
channel_name,
event_id,
hour_of_day,
day_of_week,
sample_count
);

View File

@@ -1,5 +1,5 @@
groups:
- name: siem-backend
- name: siem-backend-availability
rules:
- alert: SiemBackendDown
expr: up{job="siem-backend"} == 0
@@ -10,6 +10,26 @@ groups:
summary: "SIEM backend nicht erreichbar"
description: "Prometheus kann das SIEM-Backend seit mindestens 2 Minuten nicht scrapen."
- alert: SiemNoIngestEvents
expr: sum(rate(eventcollector_ingest_events_total[15m])) == 0
for: 15m
labels:
severity: warning
annotations:
summary: "Keine eingehenden SIEM Events"
description: "Seit mindestens 15 Minuten wurden keine Events mehr ingestiert."
- alert: SiemTooFewActiveAgents
expr: eventcollector_active_agents < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Zu wenige aktive Agents"
description: "Es wurden weniger aktive Agents erkannt als erwartet."
- name: siem-backend-detections
rules:
- alert: SiemHighDetections
expr: increase(eventcollector_detection_hits_total{severity="high"}[5m]) > 0
for: 1m
@@ -19,6 +39,33 @@ groups:
summary: "Neue High-Severity Detection"
description: "Es wurde mindestens eine neue High-Severity-Detection in den letzten 5 Minuten erzeugt."
- alert: SiemManyMediumDetections
expr: sum(increase(eventcollector_detection_hits_total{severity="medium"}[15m])) > 10
for: 2m
labels:
severity: warning
annotations:
summary: "Viele Medium-Detections"
description: "Es wurden mehr als 10 Medium-Detections in 15 Minuten erzeugt."
- alert: SiemBaselineHighAnomaly
expr: eventcollector_anomaly_score{rule="baseline_event_rate_anomaly"} >= 5
for: 2m
labels:
severity: high
annotations:
summary: "Hohe Baseline-Anomalie"
description: "Host {{ $labels.host }} hat einen hohen Baseline-Z-Score: {{ $value }}."
- alert: SiemBaselineMediumAnomaly
expr: eventcollector_anomaly_score{rule="baseline_event_rate_anomaly"} >= 3
for: 5m
labels:
severity: warning
annotations:
summary: "Baseline-Anomalie"
description: "Host {{ $labels.host }} hat einen erhöhten Baseline-Z-Score: {{ $value }}."
- alert: SiemRuleErrors
expr: increase(eventcollector_rule_errors_total[5m]) > 0
for: 1m
@@ -28,11 +75,51 @@ groups:
summary: "Fehler in Detection-Regeln"
description: "Mindestens eine Detection-Regel hat in den letzten 5 Minuten einen Fehler erzeugt."
- alert: SiemTooFewActiveAgents
expr: eventcollector_active_agents < 1
- name: siem-backend-ingest
rules:
- alert: SiemIngestRejected
expr: sum(increase(eventcollector_ingest_rejected_total[5m])) > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Ingest Requests abgelehnt"
description: "In den letzten 5 Minuten wurden Ingest Requests abgelehnt."
- alert: SiemDBInsertFailures
expr: increase(eventcollector_db_insert_failures_total[5m]) > 0
for: 1m
labels:
severity: high
annotations:
summary: "DB Insert Fehler"
description: "Das SIEM-Backend konnte Events nicht in die Datenbank schreiben."
- alert: SiemHighIngestRate
expr: sum(rate(eventcollector_ingest_events_total[5m])) > 500
for: 5m
labels:
severity: warning
annotations:
summary: "Zu wenige aktive Agents"
description: "Es wurden weniger aktive Agents erkannt als erwartet."
summary: "Sehr hohe Eventrate"
description: "Die Eventrate liegt seit 5 Minuten über 500 Events/s."
- name: siem-backend-baseline
rules:
- alert: SiemBaselineNotEnoughSamples
expr: eventcollector_baseline_sample_count > 0 and eventcollector_baseline_sample_count < 24
for: 30m
labels:
severity: info
annotations:
summary: "Baseline lernt noch"
description: "Für {{ $labels.host }} / {{ $labels.channel }} / {{ $labels.event_id }} gibt es erst {{ $value }} Samples."
- alert: SiemBaselineCurrentFarAboveAverage
expr: eventcollector_baseline_avg_count > 0 and (eventcollector_baseline_current_count / eventcollector_baseline_avg_count) > 10
for: 2m
labels:
severity: warning
annotations:
summary: "Eventrate deutlich über Baseline"
description: "{{ $labels.host }} / {{ $labels.channel }} / {{ $labels.event_id }} liegt mehr als 10x über Durchschnitt."

16
dot_env
View File

@@ -36,4 +36,18 @@ MARIADB_ROOT_PASSWORD=ROOTPASSWORT
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=admin
ENROLLMENT_KEY=BITTE_SEHR_LANG_UND_ZUFAELLIG
ENROLLMENT_KEY=BITTE_SEHR_LANG_UND_ZUFAELLIG
BASELINE_ENABLED=true
BASELINE_WINDOW=5m
BASELINE_MIN_SAMPLES=24
BASELINE_MIN_COUNT=10
BASELINE_MEDIUM_Z=2.5
BASELINE_HIGH_Z=4.0
BASELINE_SUPPRESS_FOR=1h
#BASELINE_MIN_SAMPLES=84
#BASELINE_MEDIUM_Z=3.0
#BASELINE_HIGH_Z=5.0
#BASELINE_MIN_COUNT=20

662
main.go
View File

@@ -85,6 +85,7 @@ const uiTemplates = `
<a href="/ui">Dashboard</a>
<a href="/ui/agents">Agents</a>
<a href="/ui/rules">Rules</a>
<a href="/ui/baseline">Baseline</a>
<a href="/ui/detections">Detections</a>
<a href="/ui/events">Events</a>
<a href="/metrics">Metrics</a>
@@ -246,6 +247,53 @@ const uiTemplates = `
{{template "footer" .}}
{{end}}
{{define "baseline"}}
{{template "header" .}}
<h1>{{.Title}}</h1>
<p class="muted">Baseline-Anomalien aus der Regel <strong>baseline_event_rate_anomaly</strong>.</p>
<form method="get" action="/ui/baseline">
<div class="filters">
<div><label>Host</label><input name="host" value="{{index .Filters "host"}}"></div>
<div><label>Channel</label><input name="channel" value="{{index .Filters "channel"}}"></div>
<div><label>Event ID</label><input name="event_id" value="{{index .Filters "event_id"}}"></div>
<div><label>Severity</label><input name="severity" value="{{index .Filters "severity"}}"></div>
<div><label>Limit</label><input name="limit" value="{{index .Filters "limit"}}"></div>
</div>
<button type="submit">Filtern</button>
</form>
<table>
<tr>
<th>Zeit</th>
<th>Host</th>
<th>Channel</th>
<th>EventID</th>
<th>Severity</th>
<th>Aktuell</th>
<th>Baseline</th>
<th>Z-Score</th>
<th>Samples</th>
<th>Bucket</th>
</tr>
{{range .Anomalies}}
<tr>
<td>{{fmtTime .CreatedAt}}</td>
<td>{{.Hostname}}</td>
<td>{{.Channel}}</td>
<td><a href="/ui/events?host={{q .Hostname}}&channel={{q .Channel}}&event_id={{.EventID}}">{{.EventID}}</a></td>
<td class="sev-{{.Severity}}">{{.Severity}}</td>
<td><strong>{{.Count}}</strong></td>
<td>{{printf "%.2f" .AvgCount}} ± {{printf "%.2f" .StddevCount}}</td>
<td><strong>{{printf "%.2f" .ZScore}}</strong></td>
<td>{{.SampleCount}}</td>
<td>Tag {{.DayOfWeek}}, Stunde {{.HourOfDay}}</td>
</tr>
{{end}}
</table>
{{template "footer" .}}
{{end}}
{{define "events"}}
{{template "header" .}}
<h1>{{.Title}}</h1>
@@ -391,6 +439,14 @@ type Config struct {
DetectionsLimit int
EnrollmentKey string
BaselineEnabled bool
BaselineWindow time.Duration
BaselineMinSamples int
BaselineMinCount int
BaselineMediumZScore float64
BaselineHighZScore float64
BaselineSuppressFor time.Duration
}
type LogPayload struct {
@@ -468,6 +524,11 @@ type detector struct {
ruleLastRunGauge *prometheus.GaugeVec
ruleRuntimeHist *prometheus.HistogramVec
ruleErrorsTotal *prometheus.CounterVec
baselineCurrentCountGauge *prometheus.GaugeVec
baselineAverageGauge *prometheus.GaugeVec
baselineStddevGauge *prometheus.GaugeVec
baselineSamplesGauge *prometheus.GaugeVec
}
type EventRow struct {
@@ -574,6 +635,65 @@ type DynamicRulePageData struct {
Rules []DynamicRule
}
// BaselineBucket is one aggregated row read from event_logs by
// runBaselineUpdate: the number of events seen for a host, channel and event
// ID inside one hour-of-day / day-of-week slot of the current baseline window.
type BaselineBucket struct {
Hostname string
// Channel is scanned from the channel_name column.
Channel string
EventID uint32
// Hour is the SQL HOUR(ts) value of the counted events.
Hour int
// DayOfWeek is the SQL WEEKDAY(ts) value of the counted events
// (MariaDB/MySQL numbering: 0 = Monday ... 6 = Sunday).
DayOfWeek int
// Count is COUNT(*) of the matching event_logs rows.
Count int
}
// BaselineStat mirrors the statistics columns of one baseline_event_stats row
// as read and rewritten by updateBaselineBucket.
type BaselineStat struct {
// AvgCount is the running mean of the per-window event count.
AvgCount float64
// M2Count is the running sum of squared deviations from the mean
// (the Welford accumulator from which the standard deviation is derived).
M2Count float64
// StddevCount is the stored sample standard deviation.
StddevCount float64
// SampleCount is the number of windows folded into the statistics.
SampleCount int
}
// BaselineAnomalyRow is one stored baseline_event_rate_anomaly detection as
// shown on the /ui/baseline page. It is built by listBaselineAnomalies from a
// detections row plus the values decoded from that row's details_json.
type BaselineAnomalyRow struct {
// Columns read directly from the detections table.
ID uint64
CreatedAt time.Time
Hostname string
Channel string
EventID uint32
Severity string
Score float64
WindowStart time.Time
WindowEnd time.Time
Summary string
// Values copied from details_json; they stay zero when the JSON cannot be
// decoded, except ZScore, which then falls back to Score.
Count int
AvgCount float64
StddevCount float64
ZScore float64
SampleCount int
HourOfDay int
DayOfWeek int
// WindowMin is taken from the JSON field window_minutes.
WindowMin int
}
// BaselinePageData is the view model handed to the "baseline" template by
// handleUIBaseline.
type BaselinePageData struct {
Title string
// Now is the time at which the page data was assembled.
Now time.Time
// Filters holds the trimmed query parameters (host, channel, event_id,
// severity, limit) so the form can echo them back.
Filters map[string]string
Anomalies []BaselineAnomalyRow
}
// baselineDetailsJSON is the decoding target for the details_json column of a
// baseline_event_rate_anomaly detection (see listBaselineAnomalies).
type baselineDetailsJSON struct {
Hostname string `json:"hostname"`
Channel string `json:"channel"`
EventID uint32 `json:"event_id"`
// Count is the observed event count; AvgCount and StddevCount are the
// baseline values it was compared against.
Count int `json:"count"`
AvgCount float64 `json:"avg_count"`
StddevCount float64 `json:"stddev_count"`
ZScore float64 `json:"z_score"`
SampleCount int `json:"sample_count"`
HourOfDay int `json:"hour_of_day"`
DayOfWeek int `json:"day_of_week"`
WindowMinutes int `json:"window_minutes"`
}
var (
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{Name: "eventcollector_http_requests_total", Help: "Total HTTP requests."},
@@ -671,6 +791,34 @@ func main() {
prometheus.CounterOpts{Name: "eventcollector_rule_errors_total", Help: "Rule execution errors."},
[]string{"rule"},
),
baselineCurrentCountGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eventcollector_baseline_current_count",
Help: "Current event count in baseline window.",
},
[]string{"host", "channel", "event_id"},
),
baselineAverageGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eventcollector_baseline_avg_count",
Help: "Baseline average event count.",
},
[]string{"host", "channel", "event_id"},
),
baselineStddevGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eventcollector_baseline_stddev_count",
Help: "Baseline standard deviation event count.",
},
[]string{"host", "channel", "event_id"},
),
baselineSamplesGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eventcollector_baseline_sample_count",
Help: "Baseline sample count.",
},
[]string{"host", "channel", "event_id"},
),
}
reg.MustRegister(
d.lastSeenGauge,
@@ -680,6 +828,10 @@ func main() {
d.ruleLastRunGauge,
d.ruleRuntimeHist,
d.ruleErrorsTotal,
d.baselineCurrentCountGauge,
d.baselineAverageGauge,
d.baselineStddevGauge,
d.baselineSamplesGauge,
)
s := &server{
@@ -726,6 +878,7 @@ func main() {
mux.HandleFunc("/ui/rules", s.handleUIRules)
mux.HandleFunc("/ui/rules/save", s.handleUIRuleSave)
mux.HandleFunc("/ui/rules/toggle", s.handleUIRuleToggle)
mux.HandleFunc("/ui/baseline", s.handleUIBaseline)
httpSrv := &http.Server{
Addr: cfg.ListenAddr,
@@ -759,6 +912,142 @@ func main() {
}
}
// listBaselineAnomalies loads stored detections of the rule
// baseline_event_rate_anomaly, newest first.
//
// host, channel and severity are applied as exact-match filters when they are
// non-empty; eventID is applied when it is non-zero. A limit outside 1..1000
// is replaced by 100. Each row's details_json is decoded into the anomaly
// fields; if decoding fails, only ZScore is filled, using the stored score.
func (s *server) listBaselineAnomalies(ctx context.Context, host, channel, severity string, eventID uint32, limit int) ([]BaselineAnomalyRow, error) {
	if limit < 1 || limit > 1000 {
		limit = 100
	}

	stmt := `
SELECT id, severity, hostname, channel_name, event_id, score,
window_start, window_end, summary, details_json, created_at
FROM detections
WHERE rule_name = 'baseline_event_rate_anomaly'
`
	params := make([]any, 0, 8)

	// Optional filters, in the order their placeholders appear in the query.
	optional := []struct {
		enabled bool
		clause  string
		value   any
	}{
		{host != "", ` AND hostname = ?`, host},
		{channel != "", ` AND channel_name = ?`, channel},
		{eventID != 0, ` AND event_id = ?`, eventID},
		{severity != "", ` AND severity = ?`, severity},
	}
	for _, f := range optional {
		if !f.enabled {
			continue
		}
		stmt += f.clause
		params = append(params, f.value)
	}

	stmt += ` ORDER BY created_at DESC LIMIT ?`
	params = append(params, limit)

	rs, err := s.db.QueryContext(ctx, stmt, params...)
	if err != nil {
		return nil, err
	}
	defer rs.Close()

	result := make([]BaselineAnomalyRow, 0)
	for rs.Next() {
		var (
			item       BaselineAnomalyRow
			rawDetails []byte
		)
		scanErr := rs.Scan(
			&item.ID,
			&item.Severity,
			&item.Hostname,
			&item.Channel,
			&item.EventID,
			&item.Score,
			&item.WindowStart,
			&item.WindowEnd,
			&item.Summary,
			&rawDetails,
			&item.CreatedAt,
		)
		if scanErr != nil {
			return nil, scanErr
		}

		var parsed baselineDetailsJSON
		if jsonErr := json.Unmarshal(rawDetails, &parsed); jsonErr != nil {
			// No usable details: show at least the stored score as z-score.
			item.ZScore = item.Score
		} else {
			item.Count = parsed.Count
			item.AvgCount = parsed.AvgCount
			item.StddevCount = parsed.StddevCount
			item.ZScore = parsed.ZScore
			item.SampleCount = parsed.SampleCount
			item.HourOfDay = parsed.HourOfDay
			item.DayOfWeek = parsed.DayOfWeek
			item.WindowMin = parsed.WindowMinutes
		}

		result = append(result, item)
	}
	return result, rs.Err()
}
// handleUIBaseline serves GET /ui/baseline: it reads the filter query
// parameters, loads the matching baseline anomalies and renders the
// "baseline" template. Other methods are answered with 405.
//
// An unparsable or out-of-range limit falls back to 100, and an unparsable
// event_id is treated as "no event filter"; the raw (trimmed) values are still
// echoed back to the form.
func (s *server) handleUIBaseline(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		writeError(w, http.StatusMethodNotAllowed, "method not allowed")
		return
	}

	params := r.URL.Query()
	filters := make(map[string]string, 5)
	for _, name := range []string{"host", "channel", "event_id", "severity", "limit"} {
		filters[name] = strings.TrimSpace(params.Get(name))
	}

	limit := 100
	if raw := filters["limit"]; raw != "" {
		parsed, convErr := strconv.Atoi(raw)
		if convErr == nil && parsed >= 1 && parsed <= 1000 {
			limit = parsed
		}
	}

	var eventID uint32
	if raw := filters["event_id"]; raw != "" {
		parsed, convErr := strconv.ParseUint(raw, 10, 32)
		if convErr == nil {
			eventID = uint32(parsed)
		}
	}

	// Bound the database query independently of the client connection.
	ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
	defer cancel()

	anomalies, err := s.listBaselineAnomalies(ctx, filters["host"], filters["channel"], filters["severity"], eventID, limit)
	if err != nil {
		s.logger.Printf("ui baseline: %v", err)
		writeError(w, http.StatusInternalServerError, "internal error")
		return
	}

	s.renderTemplate(w, "baseline", BaselinePageData{
		Title:     "Baseline-Anomalien",
		Now:       time.Now(),
		Filters:   filters,
		Anomalies: anomalies,
	})
}
func (s *server) listDynamicRules(ctx context.Context) ([]DynamicRule, error) {
const q = `
SELECT id,
@@ -1457,9 +1746,48 @@ func loadConfig() Config {
DetectionsLimit: getenvInt("DETECTIONS_LIMIT", 100),
EnrollmentKey: mustGetenv("ENROLLMENT_KEY"),
BaselineEnabled: getenvBool("BASELINE_ENABLED", true),
BaselineWindow: getenvDuration("BASELINE_WINDOW", 5*time.Minute),
BaselineMinSamples: getenvInt("BASELINE_MIN_SAMPLES", 24),
BaselineMinCount: getenvInt("BASELINE_MIN_COUNT", 10),
BaselineMediumZScore: getenvFloat("BASELINE_MEDIUM_Z", 2.5),
BaselineHighZScore: getenvFloat("BASELINE_HIGH_Z", 4.0),
BaselineSuppressFor: getenvDuration("BASELINE_SUPPRESS_FOR", 1*time.Hour),
}
}
// getenvBool reads the environment variable key as a boolean.
//
// An unset, empty or whitespace-only value yields def. The accepted spellings
// are matched case-insensitively: "1", "true", "yes", "y", "on" for true and
// "0", "false", "no", "n", "off" for false. Any other value terminates the
// process via log.Fatalf.
func getenvBool(key string, def bool) bool {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return def
	}

	normalized := strings.ToLower(raw)
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	for _, falsy := range [...]string{"0", "false", "no", "n", "off"} {
		if normalized == falsy {
			return false
		}
	}

	log.Fatalf("invalid bool for %s: %s", key, raw)
	return def // not reached; log.Fatalf exits the process
}
// getenvFloat reads the environment variable key as a finite float64.
//
// An unset, empty or whitespace-only value yields def. A value that does not
// parse, or that parses to NaN or ±Inf, terminates the process via log.Fatalf.
func getenvFloat(key string, def float64) float64 {
	v := strings.TrimSpace(os.Getenv(key))
	if v == "" {
		return def
	}
	f, err := strconv.ParseFloat(v, 64)
	if err != nil {
		log.Fatalf("invalid float for %s: %v", key, err)
	}
	// strconv.ParseFloat accepts "NaN" and "Inf" without error. Every
	// comparison against a NaN threshold is false, which would silently turn
	// a z-score limit into "always exceeded", so reject non-finite values the
	// same way as any other invalid input.
	if math.IsNaN(f) || math.IsInf(f, 0) {
		log.Fatalf("invalid float for %s: %q is not a finite number", key, v)
	}
	return f
}
func (s *server) handleHealthz(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
ingestRejectedTotal.WithLabelValues("method_not_allowed").Inc()
@@ -1826,6 +2154,337 @@ func (s *server) runDetectionLoop() {
}
}
// runBaselineUpdate folds the event counts of the most recent baseline window
// into baseline_event_stats.
//
// It does nothing when the baseline feature is disabled. Otherwise it counts
// the event_logs rows of the last cfg.BaselineWindow, grouped by host,
// channel, event ID, hour and weekday, and passes every group to
// updateBaselineBucket. The first error aborts the run and is returned.
//
// The grouped counts are read completely and the result set is closed before
// any bucket is updated. updateBaselineBucket opens its own transaction on
// d.db; doing that while the result set is still open would need a second
// pooled connection per row, which stalls until the context expires when the
// pool is capped, and would keep the read connection busy for the whole
// update pass.
func (d *detector) runBaselineUpdate(ctx context.Context) error {
	if !d.cfg.BaselineEnabled {
		return nil
	}

	windowEnd := time.Now().UTC()
	windowStart := windowEnd.Add(-d.cfg.BaselineWindow)

	rows, err := d.db.QueryContext(ctx, `
SELECT
hostname,
channel_name,
event_id,
HOUR(ts) AS hour_of_day,
WEEKDAY(ts) AS day_of_week,
COUNT(*) AS cnt
FROM event_logs
WHERE ts >= ? AND ts < ?
GROUP BY hostname, channel_name, event_id, HOUR(ts), WEEKDAY(ts)
`, windowStart, windowEnd)
	if err != nil {
		return err
	}
	// Covers the early returns below; Close is idempotent.
	defer rows.Close()

	var buckets []BaselineBucket
	for rows.Next() {
		var b BaselineBucket
		if err := rows.Scan(
			&b.Hostname,
			&b.Channel,
			&b.EventID,
			&b.Hour,
			&b.DayOfWeek,
			&b.Count,
		); err != nil {
			return err
		}
		buckets = append(buckets, b)
	}
	if err := rows.Err(); err != nil {
		return err
	}
	// Release the read connection before the per-bucket transactions start.
	if err := rows.Close(); err != nil {
		return err
	}

	for _, b := range buckets {
		if err := d.updateBaselineBucket(ctx, b); err != nil {
			return err
		}
	}
	return nil
}
// updateBaselineBucket folds one observed window count into the statistics
// row of its slot (host, channel, event ID, hour, weekday).
//
// The row is read with FOR UPDATE inside a transaction. If the slot has no row
// yet, one is inserted with the count as mean, zero spread and a sample count
// of 1. Otherwise mean and the squared-deviation accumulator are advanced by
// one step of Welford's online algorithm and the sample standard deviation is
// recomputed. Any error rolls the transaction back and is returned.
func (d *detector) updateBaselineBucket(ctx context.Context, b BaselineBucket) error {
	tx, err := d.db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	// Rolling back after a successful Commit is harmless; its error is ignored.
	defer func() { _ = tx.Rollback() }()

	var prev BaselineStat
	lookupErr := tx.QueryRowContext(ctx, `
SELECT avg_count, m2_count, stddev_count, sample_count
FROM baseline_event_stats
WHERE hostname = ?
AND channel_name = ?
AND event_id = ?
AND hour_of_day = ?
AND day_of_week = ?
FOR UPDATE
`,
		b.Hostname,
		b.Channel,
		b.EventID,
		b.Hour,
		b.DayOfWeek,
	).Scan(
		&prev.AvgCount,
		&prev.M2Count,
		&prev.StddevCount,
		&prev.SampleCount,
	)

	observed := float64(b.Count)

	switch {
	case errors.Is(lookupErr, sql.ErrNoRows):
		// First observation for this slot: the mean is the observation itself.
		if _, insertErr := tx.ExecContext(ctx, `
INSERT INTO baseline_event_stats
(hostname, channel_name, event_id, hour_of_day, day_of_week,
avg_count, m2_count, stddev_count, sample_count)
VALUES (?, ?, ?, ?, ?, ?, 0, 0, 1)
`,
			b.Hostname,
			b.Channel,
			b.EventID,
			b.Hour,
			b.DayOfWeek,
			observed,
		); insertErr != nil {
			return insertErr
		}
		return tx.Commit()
	case lookupErr != nil:
		return lookupErr
	}

	// Welford step: update mean and M2 without revisiting earlier samples.
	n := prev.SampleCount + 1
	deviationBefore := observed - prev.AvgCount
	mean := prev.AvgCount + deviationBefore/float64(n)
	deviationAfter := observed - mean
	m2 := prev.M2Count + deviationBefore*deviationAfter

	var stddev float64
	if n > 1 {
		stddev = math.Sqrt(m2 / float64(n-1))
	}

	if _, updateErr := tx.ExecContext(ctx, `
UPDATE baseline_event_stats
SET avg_count = ?,
m2_count = ?,
stddev_count = ?,
sample_count = ?,
last_updated = CURRENT_TIMESTAMP(6)
WHERE hostname = ?
AND channel_name = ?
AND event_id = ?
AND hour_of_day = ?
AND day_of_week = ?
`,
		mean,
		m2,
		stddev,
		n,
		b.Hostname,
		b.Channel,
		b.EventID,
		b.Hour,
		b.DayOfWeek,
	); updateErr != nil {
		return updateErr
	}
	return tx.Commit()
}
// runBaselineAnomalyRule compares the event counts of the most recent
// baseline window with the stored per-bucket statistics and records a
// detection for every bucket whose count is unusually high.
//
// The rule is a no-op when d.cfg.BaselineEnabled is false. Otherwise it:
//
//  1. Counts event_logs rows with ts in [now-BaselineWindow, now) (UTC),
//     grouped by hostname, channel, event ID, hour of day and weekday, joined
//     to the matching baseline_event_stats row.
//  2. Publishes the current count, average, stddev and sample count gauges for
//     every joined bucket, including buckets that are filtered out afterwards.
//  3. Skips buckets with fewer than BaselineMinSamples samples, fewer than
//     BaselineMinCount events, or a non-positive stddev.
//  4. Computes z = (count - avg) / stddev and skips buckets below
//     BaselineMediumZScore. Severity is "high" from BaselineHighZScore
//     upwards and "medium" otherwise. With a positive medium threshold only
//     counts above the average can trigger.
//  5. Skips buckets for which isBaselineSuppressed reports a recent detection,
//     then stores the detection via insertDetection. The hit counter and the
//     anomaly score gauge are only touched when insertDetection reports that a
//     row was actually created.
//
// The first error from the database, the scan or a helper aborts the run and
// is returned unchanged.
func (d *detector) runBaselineAnomalyRule(ctx context.Context) error {
	if !d.cfg.BaselineEnabled {
		return nil
	}

	const ruleName = "baseline_event_rate_anomaly"

	windowEnd := time.Now().UTC()
	windowStart := windowEnd.Add(-d.cfg.BaselineWindow)
	// Loop-invariant: used in both the summary text and the details payload.
	windowMinutes := int(d.cfg.BaselineWindow.Minutes())

	rows, err := d.db.QueryContext(ctx, `
SELECT
e.hostname,
e.channel_name,
e.event_id,
HOUR(e.ts) AS hour_of_day,
WEEKDAY(e.ts) AS day_of_week,
COUNT(*) AS cnt,
b.avg_count,
b.stddev_count,
b.sample_count
FROM event_logs e
JOIN baseline_event_stats b
ON b.hostname = e.hostname
AND b.channel_name = e.channel_name
AND b.event_id = e.event_id
AND b.hour_of_day = HOUR(e.ts)
AND b.day_of_week = WEEKDAY(e.ts)
WHERE e.ts >= ? AND e.ts < ?
GROUP BY
e.hostname,
e.channel_name,
e.event_id,
HOUR(e.ts),
WEEKDAY(e.ts),
b.avg_count,
b.stddev_count,
b.sample_count
`, windowStart, windowEnd)
	if err != nil {
		return err
	}
	defer rows.Close()

	// NOTE(review): isBaselineSuppressed (and presumably insertDetection) run
	// their own statements on d.db while this cursor is still open, so the
	// rule needs more than one pooled connection. Confirm the pool is not
	// capped at a single connection.
	for rows.Next() {
		var (
			host, channel          string
			eventID                uint32
			hour, dayOfWeek, count int
			avg, stddev            float64
			samples                int
		)
		if err := rows.Scan(
			&host,
			&channel,
			&eventID,
			&hour,
			&dayOfWeek,
			&count,
			&avg,
			&stddev,
			&samples,
		); err != nil {
			return err
		}

		// Format the unsigned ID directly: converting uint32 to int first
		// would wrap to a negative label on platforms where int is 32 bits.
		eventIDStr := strconv.FormatUint(uint64(eventID), 10)
		d.baselineCurrentCountGauge.WithLabelValues(host, channel, eventIDStr).Set(float64(count))
		d.baselineAverageGauge.WithLabelValues(host, channel, eventIDStr).Set(avg)
		d.baselineStddevGauge.WithLabelValues(host, channel, eventIDStr).Set(stddev)
		d.baselineSamplesGauge.WithLabelValues(host, channel, eventIDStr).Set(float64(samples))

		// Not enough history, too little traffic, or no measurable spread:
		// a z-score would be meaningless (or a division by zero).
		if samples < d.cfg.BaselineMinSamples || count < d.cfg.BaselineMinCount || stddev <= 0 {
			continue
		}

		z := (float64(count) - avg) / stddev
		if z < d.cfg.BaselineMediumZScore {
			continue
		}
		severity := "medium"
		if z >= d.cfg.BaselineHighZScore {
			severity = "high"
		}

		suppressed, err := d.isBaselineSuppressed(ctx, host, channel, eventID, windowEnd)
		if err != nil {
			return err
		}
		if suppressed {
			continue
		}

		created, err := d.insertDetection(ctx, Detection{
			RuleName:    ruleName,
			Severity:    severity,
			Hostname:    host,
			Channel:     channel,
			EventID:     eventID,
			Score:       z,
			WindowStart: windowStart,
			WindowEnd:   windowEnd,
			Summary: fmt.Sprintf(
				"Baseline-Anomalie auf %s: %s EventID %d kam %d-mal in %d Minuten, normal %.2f ± %.2f, z=%.2f",
				host,
				channel,
				eventID,
				count,
				windowMinutes,
				avg,
				stddev,
				z,
			),
			Details: mustJSON(map[string]any{
				"hostname":       host,
				"channel":        channel,
				"event_id":       eventID,
				"count":          count,
				"avg_count":      avg,
				"stddev_count":   stddev,
				"z_score":        z,
				"sample_count":   samples,
				"hour_of_day":    hour,
				"day_of_week":    dayOfWeek,
				"window_minutes": windowMinutes,
				"min_samples":    d.cfg.BaselineMinSamples,
				"medium_z":       d.cfg.BaselineMediumZScore,
				"high_z":         d.cfg.BaselineHighZScore,
			}),
		})
		if err != nil {
			return err
		}
		// Only count detections that were actually stored.
		if created {
			d.detectionHitsTotal.WithLabelValues(ruleName, severity).Inc()
			d.anomalyScoreGauge.WithLabelValues(host, ruleName).Set(z)
		}
	}
	return rows.Err()
}
// isBaselineSuppressed reports whether a baseline_event_rate_anomaly
// detection for the given hostname, channel and event ID was already created
// within the last d.cfg.BaselineSuppressFor before now (converted to UTC).
//
// A non-positive BaselineSuppressFor disables suppression: the function then
// returns false without querying the database. A query or scan error is
// returned unchanged together with false.
func (d *detector) isBaselineSuppressed(ctx context.Context, hostname, channel string, eventID uint32, now time.Time) (bool, error) {
	suppressFor := d.cfg.BaselineSuppressFor
	if suppressFor <= 0 {
		return false, nil
	}

	const recentDetections = `
SELECT COUNT(*)
FROM detections
WHERE rule_name = 'baseline_event_rate_anomaly'
AND hostname = ?
AND channel_name = ?
AND event_id = ?
AND created_at >= ?
`
	// Detections created at or after this instant suppress a new one.
	cutoff := now.UTC().Add(-suppressFor)
	row := d.db.QueryRowContext(ctx, recentDetections, hostname, channel, eventID, cutoff)

	var recent int
	if err := row.Scan(&recent); err != nil {
		return false, err
	}
	return recent > 0, nil
}
func (d *detector) runDynamicRules(ctx context.Context) error {
rows, err := d.db.QueryContext(ctx, `
SELECT id, name, description, severity, channel, event_ids,
@@ -2287,6 +2946,9 @@ func (s *server) runDetectionsOnce() {
{"success_after_failures", s.detector.runSuccessAfterFailuresRule},
{"new_source_ip_for_user", s.detector.runNewSourceIPForUserRule},
{"dynamic_rules", s.detector.runDynamicRules},
{"baseline_anomaly", s.detector.runBaselineAnomalyRule},
{"baseline_update", s.detector.runBaselineUpdate},
}
for _, rule := range rules {