From 3447af8d44842a1905b07157ab787fb9c18edc40 Mon Sep 17 00:00:00 2001 From: jbergner Date: Fri, 24 Apr 2026 21:38:25 +0200 Subject: [PATCH] Anpassung von Prometheus, Grafana und Backend auf Anomalieerkennung. --- .../dashboards/siem-overview.json | 713 +++++++++++++++++- deploy/mariadb/init/001-schema.sql | 46 +- deploy/prometheus/rules/siem-alerts.yml | 97 ++- dot_env | 16 +- main.go | 662 ++++++++++++++++ 5 files changed, 1502 insertions(+), 32 deletions(-) diff --git a/deploy/grafana/provisioning/dashboards/siem-overview.json b/deploy/grafana/provisioning/dashboards/siem-overview.json index 433b4f5..a0b0bfe 100644 --- a/deploy/grafana/provisioning/dashboards/siem-overview.json +++ b/deploy/grafana/provisioning/dashboards/siem-overview.json @@ -3,74 +3,743 @@ "list": [] }, "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, "panels": [ { "type": "stat", "title": "Active Agents", - "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, "targets": [ { "expr": "eventcollector_active_agents", "refId": "A" } - ] + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + } }, { "type": "stat", - "title": "High Detections (5m)", - "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "title": "Events/s", + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, "targets": [ { - "expr": "increase(eventcollector_detection_hits_total{severity=\"high\"}[5m])", + "expr": "sum(rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m]))", "refId": "A" } - ] + ], + "fieldConfig": { + "defaults": { + "unit": "eps", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "auto" + } }, + { + "type": "stat", + "title": "High Detections 5m", + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "targets": [ + { + "expr": "sum(increase(eventcollector_detection_hits_total{severity=\"high\",rule=~\"$rule\"}[5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "auto" + } + }, + { + "type": "stat", + "title": "Baseline Max Z-Score", + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "targets": [ + { + "expr": "max(eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 3 }, + { "color": "red", "value": 5 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "auto" + } + }, + { + "type": "stat", + "title": "Rule Errors 5m", + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 }, + "targets": [ + { + "expr": "sum(increase(eventcollector_rule_errors_total{rule=~\"$rule\"}[5m]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "auto" + } + }, + { + "type": "stat", + "title": "DB Insert Failures 5m", + "datasource": "$datasource", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 }, + "targets": [ + { + "expr": "increase(eventcollector_db_insert_failures_total[5m])", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "auto" + } + }, + { "type": "timeseries", - "title": "HTTP Requests", + "title": "Ingested Events / Second by Channel", + "datasource": "$datasource", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "targets": [ { - "expr": "rate(eventcollector_http_requests_total[5m])", - "legendFormat": "{{path}} {{status}}", + "expr": "sum by (channel) (rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m]))", + "legendFormat": "{{channel}}", "refId": "A" } - ] + ], + "fieldConfig": { + "defaults": { + "unit": "eps", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } }, { "type": "timeseries", - "title": "Detection Hits", + "title": "Detection Hits by Rule / Severity", + "datasource": "$datasource", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "targets": [ { - "expr": "increase(eventcollector_detection_hits_total[5m])", - "legendFormat": "{{rule}} {{severity}}", + "expr": "sum by (rule,severity) (increase(eventcollector_detection_hits_total{rule=~\"$rule\",severity=~\"$severity\"}[5m]))", + "legendFormat": "{{rule}} / {{severity}}", "refId": "A" } - ] + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + + { + "type": "timeseries", + "title": "Baseline: Current Count vs Average", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "targets": [ + { + "expr": "eventcollector_baseline_current_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}", + "legendFormat": "current {{host}} {{channel}} {{event_id}}", + "refId": "A" + }, + { + "expr": "eventcollector_baseline_avg_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}", + "legendFormat": "avg {{host}} {{channel}} {{event_id}}", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } }, { "type": "timeseries", - "title": "Ingested Events", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }, + "title": "Baseline Z-Score", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "targets": [ { - "expr": "rate(eventcollector_ingest_events_total[5m])", - "legendFormat": "{{channel}} {{event_id}}", + "expr": "eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"}", + "legendFormat": "{{host}}", "refId": "A" } - ] + ], + "fieldConfig": { + "defaults": { + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 3 }, + { "color": "red", "value": 5 } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + + { + "type": "bargauge", + "title": "Top Baseline Z-Scores", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 20 }, + "targets": [ + { + "expr": "topk(10, eventcollector_anomaly_score{host=~\"$host\",rule=\"baseline_event_rate_anomaly\"})", + "legendFormat": "{{host}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 3 }, + { "color": "red", "value": 5 } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": true + } + }, + { + "type": "bargauge", + "title": "Top EventIDs by Ingest Rate", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 20 }, + "targets": [ + { + "expr": "topk(15, sum by (channel,event_id) (rate(eventcollector_ingest_events_total{channel=~\"$channel\",event_id=~\"$event_id\"}[5m])))", + "legendFormat": "{{channel}} / {{event_id}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "eps", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": true + } + }, + { + "type": "bargauge", + "title": "Top Detection Rules 1h", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }, + "targets": [ + { + "expr": "topk(15, sum by (rule,severity) (increase(eventcollector_detection_hits_total{rule=~\"$rule\",severity=~\"$severity\"}[1h])))", + "legendFormat": "{{rule}} / {{severity}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": true + } + }, + + { + "type": "timeseries", + "title": "HTTP Requests by Path / Status", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, + "targets": [ + { + "expr": "sum by (path,status) (rate(eventcollector_http_requests_total[5m]))", + "legendFormat": "{{path}} {{status}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + { + "type": "timeseries", + "title": "HTTP Latency p95", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le,path) (rate(eventcollector_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "{{path}} p95", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "decimals": 3 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + + { + "type": "timeseries", + "title": "DB Insert Transaction Latency p95", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(eventcollector_db_tx_duration_seconds_bucket[5m])))", + "legendFormat": "db tx p95", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "decimals": 3 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + { + "type": "timeseries", + "title": "DB Batch Size p95", + "datasource": "$datasource", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(eventcollector_db_batch_size_bucket[5m])))", + "legendFormat": "batch size p95", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + + { + "type": "table", + "title": "Agent Last Seen", + "datasource": "$datasource", + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 44 }, + "targets": [ + { + "expr": "time() - eventcollector_agent_last_seen_unixtime{host=~\"$host\"}", + "legendFormat": "{{host}}", + "refId": "A", + "instant": true, + "format": "table" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "showHeader": true + } + }, + { + "type": "table", + "title": "Baseline Samples", + "datasource": "$datasource", + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 44 }, + "targets": [ + { + "expr": "eventcollector_baseline_sample_count{host=~\"$host\",channel=~\"$channel\",event_id=~\"$event_id\"}", + "legendFormat": "{{host}} {{channel}} {{event_id}}", + "refId": "A", + "instant": true, + "format": "table" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "showHeader": true + } } ], + "refresh": "30s", "schemaVersion": 39, "style": "dark", - "tags": ["siem"], - "templating": { "list": [] }, + "tags": ["siem", "baseline", "ad"], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": {}, + "hide": 0, + "label": "Datasource" + }, + { + "name": "host", + "type": "query", + "datasource": "$datasource", + "query": "label_values(eventcollector_agent_last_seen_unixtime, host)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "label": "Host" + }, + { + "name": "channel", + "type": "query", + "datasource": "$datasource", + "query": "label_values(eventcollector_ingest_events_total, channel)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "label": "Channel" + }, + { + "name": "event_id", + "type": "query", + "datasource": "$datasource", + "query": "label_values(eventcollector_ingest_events_total, event_id)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "label": "Event ID" + }, + { + "name": "rule", + "type": "query", + "datasource": "$datasource", + "query": "label_values(eventcollector_detection_hits_total, rule)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "label": "Rule" + }, + { + "name": "severity", + "type": "custom", + "query": "low,medium,high", + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "label": "Severity" + } + ] + }, "time": { "from": "now-6h", "to": "now" }, - "title": "SIEM Overview", + "timezone": "browser", + "title": "SIEM Overview Extended", + "uid": "siem-overview-extended", "version": 1 } \ No newline at end of file diff --git a/deploy/mariadb/init/001-schema.sql b/deploy/mariadb/init/001-schema.sql index 0120a6c..ef06ea0 100644 --- a/deploy/mariadb/init/001-schema.sql +++ b/deploy/mariadb/init/001-schema.sql @@ -89,14 +89,14 @@ CREATE TABLE IF NOT EXISTS detections ( CREATE TABLE detection_rules ( id BIGINT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(128) NOT NULL UNIQUE, + name VARCHAR(255) NOT NULL UNIQUE, description TEXT, severity VARCHAR(16) NOT NULL DEFAULT 'medium', - channel VARCHAR(64) NOT NULL DEFAULT 'Security', + channel VARCHAR(255) NOT NULL DEFAULT 'Security', event_ids VARCHAR(255) NOT NULL, - match_field VARCHAR(64) DEFAULT '', + match_field VARCHAR(255) DEFAULT '', match_operator VARCHAR(16) DEFAULT '', match_value TEXT, @@ -1312,4 +1312,42 @@ ALTER TABLE detection_rules MODIFY description TEXT NULL, MODIFY match_value TEXT NULL, MODIFY match_field VARCHAR(64) NOT NULL DEFAULT '', -MODIFY match_operator VARCHAR(16) NOT NULL DEFAULT ''; \ No newline at end of file +MODIFY match_operator VARCHAR(16) NOT NULL DEFAULT ''; + + + +CREATE TABLE baseline_event_stats ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + + hostname VARCHAR(255) NOT NULL, + channel_name VARCHAR(255) NOT NULL, + event_id INT NOT NULL, + + hour_of_day TINYINT NOT NULL, + day_of_week TINYINT NOT NULL, + + avg_count DOUBLE NOT NULL DEFAULT 0, + m2_count DOUBLE NOT NULL DEFAULT 0, + stddev_count DOUBLE NOT NULL DEFAULT 0, + sample_count INT NOT NULL DEFAULT 0, + + last_updated TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP(6), + + UNIQUE KEY uniq_baseline_event ( + hostname, + channel_name, + event_id, + hour_of_day, + day_of_week + ) +); + +CREATE INDEX idx_baseline_event_lookup +ON baseline_event_stats ( + hostname, + channel_name, + event_id, + hour_of_day, + day_of_week, + sample_count +); \ No newline at end of file diff --git a/deploy/prometheus/rules/siem-alerts.yml b/deploy/prometheus/rules/siem-alerts.yml index f9c7fa6..b6f2cee 100644 --- a/deploy/prometheus/rules/siem-alerts.yml +++ b/deploy/prometheus/rules/siem-alerts.yml @@ -1,5 +1,5 @@ groups: - - name: siem-backend + - name: siem-backend-availability rules: - alert: SiemBackendDown expr: up{job="siem-backend"} == 0 @@ -10,6 +10,26 @@ groups: summary: "SIEM backend nicht erreichbar" description: "Prometheus kann das SIEM-Backend seit mindestens 2 Minuten nicht scrapen." + - alert: SiemNoIngestEvents + expr: sum(rate(eventcollector_ingest_events_total[15m])) == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "Keine eingehenden SIEM Events" + description: "Seit mindestens 15 Minuten wurden keine Events mehr ingestiert." + + - alert: SiemTooFewActiveAgents + expr: eventcollector_active_agents < 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Zu wenige aktive Agents" + description: "Es wurden weniger aktive Agents erkannt als erwartet." + + - name: siem-backend-detections + rules: - alert: SiemHighDetections expr: increase(eventcollector_detection_hits_total{severity="high"}[5m]) > 0 for: 1m @@ -19,6 +39,33 @@ groups: summary: "Neue High-Severity Detection" description: "Es wurde mindestens eine neue High-Severity-Detection in den letzten 5 Minuten erzeugt." + - alert: SiemManyMediumDetections + expr: sum(increase(eventcollector_detection_hits_total{severity="medium"}[15m])) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "Viele Medium-Detections" + description: "Es wurden mehr als 10 Medium-Detections in 15 Minuten erzeugt." + + - alert: SiemBaselineHighAnomaly + expr: eventcollector_anomaly_score{rule="baseline_event_rate_anomaly"} >= 5 + for: 2m + labels: + severity: high + annotations: + summary: "Hohe Baseline-Anomalie" + description: "Host {{ $labels.host }} hat einen hohen Baseline-Z-Score: {{ $value }}." + + - alert: SiemBaselineMediumAnomaly + expr: eventcollector_anomaly_score{rule="baseline_event_rate_anomaly"} >= 3 + for: 5m + labels: + severity: warning + annotations: + summary: "Baseline-Anomalie" + description: "Host {{ $labels.host }} hat einen erhöhten Baseline-Z-Score: {{ $value }}." + - alert: SiemRuleErrors expr: increase(eventcollector_rule_errors_total[5m]) > 0 for: 1m @@ -28,11 +75,51 @@ groups: summary: "Fehler in Detection-Regeln" description: "Mindestens eine Detection-Regel hat in den letzten 5 Minuten einen Fehler erzeugt." - - alert: SiemTooFewActiveAgents - expr: eventcollector_active_agents < 1 + - name: siem-backend-ingest + rules: + - alert: SiemIngestRejected + expr: sum(increase(eventcollector_ingest_rejected_total[5m])) > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "Ingest Requests abgelehnt" + description: "In den letzten 5 Minuten wurden Ingest Requests abgelehnt." + + - alert: SiemDBInsertFailures + expr: increase(eventcollector_db_insert_failures_total[5m]) > 0 + for: 1m + labels: + severity: high + annotations: + summary: "DB Insert Fehler" + description: "Das SIEM-Backend konnte Events nicht in die Datenbank schreiben." + + - alert: SiemHighIngestRate + expr: sum(rate(eventcollector_ingest_events_total[5m])) > 500 for: 5m labels: severity: warning annotations: - summary: "Zu wenige aktive Agents" - description: "Es wurden weniger aktive Agents erkannt als erwartet." \ No newline at end of file + summary: "Sehr hohe Eventrate" + description: "Die Eventrate liegt seit 5 Minuten über 500 Events/s." + + - name: siem-backend-baseline + rules: + - alert: SiemBaselineNotEnoughSamples + expr: eventcollector_baseline_sample_count > 0 and eventcollector_baseline_sample_count < 24 + for: 30m + labels: + severity: info + annotations: + summary: "Baseline lernt noch" + description: "Für {{ $labels.host }} / {{ $labels.channel }} / {{ $labels.event_id }} gibt es erst {{ $value }} Samples." + + - alert: SiemBaselineCurrentFarAboveAverage + expr: eventcollector_baseline_avg_count > 0 and (eventcollector_baseline_current_count / eventcollector_baseline_avg_count) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "Eventrate deutlich über Baseline" + description: "{{ $labels.host }} / {{ $labels.channel }} / {{ $labels.event_id }} liegt mehr als 10x über Durchschnitt." \ No newline at end of file diff --git a/dot_env b/dot_env index c67db39..7b73975 100644 --- a/dot_env +++ b/dot_env @@ -36,4 +36,18 @@ MARIADB_ROOT_PASSWORD=ROOTPASSWORT GRAFANA_ADMIN_USER=admin GRAFANA_ADMIN_PASSWORD=admin -ENROLLMENT_KEY=BITTE_SEHR_LANG_UND_ZUFAELLIG \ No newline at end of file +ENROLLMENT_KEY=BITTE_SEHR_LANG_UND_ZUFAELLIG + +BASELINE_ENABLED=true +BASELINE_WINDOW=5m +BASELINE_MIN_SAMPLES=24 +BASELINE_MIN_COUNT=10 +BASELINE_MEDIUM_Z=2.5 +BASELINE_HIGH_Z=4.0 +BASELINE_SUPPRESS_FOR=1h + + +#BASELINE_MIN_SAMPLES=84 +#BASELINE_MEDIUM_Z=3.0 +#BASELINE_HIGH_Z=5.0 +#BASELINE_MIN_COUNT=20 \ No newline at end of file diff --git a/main.go b/main.go index 5419863..b04efb7 100644 --- a/main.go +++ b/main.go @@ -85,6 +85,7 @@ const uiTemplates = ` Dashboard Agents Rules + Baseline Detections Events Metrics @@ -246,6 +247,53 @@ const uiTemplates = ` {{template "footer" .}} {{end}} +{{define "baseline"}} +{{template "header" .}} +

{{.Title}}

+

Baseline-Anomalien aus der Regel baseline_event_rate_anomaly.

+ +
+
+
+
+
+
+
+
+ +
+ + + + + + + + + + + + + + + {{range .Anomalies}} + + + + + + + + + + + + + {{end}} +
ZeitHostChannelEventIDSeverityAktuellBaselineZ-ScoreSamplesBucket
{{fmtTime .CreatedAt}}{{.Hostname}}{{.Channel}}{{.EventID}}{{.Severity}}{{.Count}}{{printf "%.2f" .AvgCount}} ± {{printf "%.2f" .StddevCount}}{{printf "%.2f" .ZScore}}{{.SampleCount}}Tag {{.DayOfWeek}}, Stunde {{.HourOfDay}}
+{{template "footer" .}} +{{end}} + {{define "events"}} {{template "header" .}}

{{.Title}}

@@ -391,6 +439,14 @@ type Config struct { DetectionsLimit int EnrollmentKey string + + BaselineEnabled bool + BaselineWindow time.Duration + BaselineMinSamples int + BaselineMinCount int + BaselineMediumZScore float64 + BaselineHighZScore float64 + BaselineSuppressFor time.Duration } type LogPayload struct { @@ -468,6 +524,11 @@ type detector struct { ruleLastRunGauge *prometheus.GaugeVec ruleRuntimeHist *prometheus.HistogramVec ruleErrorsTotal *prometheus.CounterVec + + baselineCurrentCountGauge *prometheus.GaugeVec + baselineAverageGauge *prometheus.GaugeVec + baselineStddevGauge *prometheus.GaugeVec + baselineSamplesGauge *prometheus.GaugeVec } type EventRow struct { @@ -574,6 +635,65 @@ type DynamicRulePageData struct { Rules []DynamicRule } +type BaselineBucket struct { + Hostname string + Channel string + EventID uint32 + Hour int + DayOfWeek int + Count int +} + +type BaselineStat struct { + AvgCount float64 + M2Count float64 + StddevCount float64 + SampleCount int +} + +type BaselineAnomalyRow struct { + ID uint64 + CreatedAt time.Time + Hostname string + Channel string + EventID uint32 + Severity string + Score float64 + WindowStart time.Time + WindowEnd time.Time + Summary string + + Count int + AvgCount float64 + StddevCount float64 + ZScore float64 + SampleCount int + HourOfDay int + DayOfWeek int + WindowMin int +} + +type BaselinePageData struct { + Title string + Now time.Time + Filters map[string]string + Anomalies []BaselineAnomalyRow +} + +type baselineDetailsJSON struct { + Hostname string `json:"hostname"` + Channel string `json:"channel"` + EventID uint32 `json:"event_id"` + Count int `json:"count"` + AvgCount float64 `json:"avg_count"` + StddevCount float64 `json:"stddev_count"` + ZScore float64 `json:"z_score"` + SampleCount int `json:"sample_count"` + HourOfDay int `json:"hour_of_day"` + DayOfWeek int `json:"day_of_week"` + WindowMinutes int `json:"window_minutes"` +} + var ( httpRequestsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{Name: "eventcollector_http_requests_total", Help: "Total HTTP requests."}, @@ -671,6 +791,34 @@ func main() { prometheus.CounterOpts{Name: "eventcollector_rule_errors_total", Help: "Rule execution errors."}, []string{"rule"}, ), + baselineCurrentCountGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "eventcollector_baseline_current_count", + Help: "Current event count in baseline window.", + }, + []string{"host", "channel", "event_id"}, + ), + baselineAverageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "eventcollector_baseline_avg_count", + Help: "Baseline average event count.", + }, + []string{"host", "channel", "event_id"}, + ), + baselineStddevGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "eventcollector_baseline_stddev_count", + Help: "Baseline standard deviation event count.", + }, + []string{"host", "channel", "event_id"}, + ), + baselineSamplesGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "eventcollector_baseline_sample_count", + Help: "Baseline sample count.", + }, + []string{"host", "channel", "event_id"}, + ), } reg.MustRegister( d.lastSeenGauge, @@ -680,6 +828,10 @@ func main() { d.ruleLastRunGauge, d.ruleRuntimeHist, d.ruleErrorsTotal, + d.baselineCurrentCountGauge, + d.baselineAverageGauge, + d.baselineStddevGauge, + d.baselineSamplesGauge, ) s := &server{ @@ -726,6 +878,7 @@ func main() { mux.HandleFunc("/ui/rules", s.handleUIRules) mux.HandleFunc("/ui/rules/save", s.handleUIRuleSave) mux.HandleFunc("/ui/rules/toggle", s.handleUIRuleToggle) + mux.HandleFunc("/ui/baseline", s.handleUIBaseline) httpSrv := &http.Server{ Addr: cfg.ListenAddr, @@ -759,6 +912,142 @@ func main() { } } +func (s *server) listBaselineAnomalies(ctx context.Context, host, channel, severity string, eventID uint32, limit int) ([]BaselineAnomalyRow, error) { + if limit <= 0 || limit > 1000 { + limit = 100 + } + + query := ` +SELECT id, severity, hostname, channel_name, event_id, score, + window_start, window_end, summary, details_json, created_at +FROM detections +WHERE rule_name = 'baseline_event_rate_anomaly' +` + args := make([]any, 0, 8) + + if host != "" { + query += ` AND hostname = ?` + args = append(args, host) + } + if channel != "" { + query += ` AND channel_name = ?` + args = append(args, channel) + } + if eventID != 0 { + query += ` AND event_id = ?` + args = append(args, eventID) + } + if severity != "" { + query += ` AND severity = ?` + args = append(args, severity) + } + + query += ` ORDER BY created_at DESC LIMIT ?` + args = append(args, limit) + + rows, err := s.db.QueryContext(ctx, query, args...) + if err != nil { + return nil, err + } + defer rows.Close() + + out := make([]BaselineAnomalyRow, 0) + + for rows.Next() { + var row BaselineAnomalyRow + var detailsRaw []byte + + if err := rows.Scan( + &row.ID, + &row.Severity, + &row.Hostname, + &row.Channel, + &row.EventID, + &row.Score, + &row.WindowStart, + &row.WindowEnd, + &row.Summary, + &detailsRaw, + &row.CreatedAt, + ); err != nil { + return nil, err + } + + var details baselineDetailsJSON + if err := json.Unmarshal(detailsRaw, &details); err == nil { + row.Count = details.Count + row.AvgCount = details.AvgCount + row.StddevCount = details.StddevCount + row.ZScore = details.ZScore + row.SampleCount = details.SampleCount + row.HourOfDay = details.HourOfDay + row.DayOfWeek = details.DayOfWeek + row.WindowMin = details.WindowMinutes + } else { + row.ZScore = row.Score + } + + out = append(out, row) + } + + return out, rows.Err() +} + +func (s *server) handleUIBaseline(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + filters := map[string]string{ + "host": strings.TrimSpace(r.URL.Query().Get("host")), + "channel": strings.TrimSpace(r.URL.Query().Get("channel")), + "event_id": strings.TrimSpace(r.URL.Query().Get("event_id")), + "severity": strings.TrimSpace(r.URL.Query().Get("severity")), + "limit": strings.TrimSpace(r.URL.Query().Get("limit")), + } + + limit := 100 + if filters["limit"] != "" { + if n, err := strconv.Atoi(filters["limit"]); err == nil && n > 0 && n <= 1000 { + limit = n + } + } + + var eventID uint32 + if filters["event_id"] != "" { + if n, err := strconv.ParseUint(filters["event_id"], 10, 32); err == nil { + eventID = uint32(n) + } + } + + ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second) + defer cancel() + + items, err := s.listBaselineAnomalies( + ctx, + filters["host"], + filters["channel"], + filters["severity"], + eventID, + limit, + ) + if err != nil { + s.logger.Printf("ui baseline: %v", err) + writeError(w, http.StatusInternalServerError, "internal error") + return + } + + data := BaselinePageData{ + Title: "Baseline-Anomalien", + Now: time.Now(), + Filters: filters, + Anomalies: items, + } + + s.renderTemplate(w, "baseline", data) +} + func (s *server) listDynamicRules(ctx context.Context) ([]DynamicRule, error) { const q = ` SELECT id, @@ -1457,9 +1746,48 @@ func loadConfig() Config { DetectionsLimit: getenvInt("DETECTIONS_LIMIT", 100), EnrollmentKey: mustGetenv("ENROLLMENT_KEY"), + + BaselineEnabled: getenvBool("BASELINE_ENABLED", true), + BaselineWindow: getenvDuration("BASELINE_WINDOW", 5*time.Minute), + BaselineMinSamples: getenvInt("BASELINE_MIN_SAMPLES", 24), + BaselineMinCount: getenvInt("BASELINE_MIN_COUNT", 10), + BaselineMediumZScore: getenvFloat("BASELINE_MEDIUM_Z", 2.5), + BaselineHighZScore: getenvFloat("BASELINE_HIGH_Z", 4.0), + BaselineSuppressFor: getenvDuration("BASELINE_SUPPRESS_FOR", 1*time.Hour), } } +func getenvBool(key string, def bool) bool { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return def + } + + switch strings.ToLower(v) { + case "1", "true", "yes", "y", "on": + return true + case "0", "false", "no", "n", "off": + return false + default: + log.Fatalf("invalid bool for %s: %s", key, v) + return def + } +} + +func getenvFloat(key string, def float64) float64 { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return def + } + + f, err := strconv.ParseFloat(v, 64) + if err != nil { + log.Fatalf("invalid float for %s: %v", key, err) + } + + return f +} + func (s *server) handleHealthz(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { ingestRejectedTotal.WithLabelValues("method_not_allowed").Inc() @@ -1826,6 +2154,337 @@ func (s *server) runDetectionLoop() { } } +func (d *detector) runBaselineUpdate(ctx context.Context) error { + if !d.cfg.BaselineEnabled { + return nil + } + + windowEnd := time.Now().UTC() + windowStart := windowEnd.Add(-d.cfg.BaselineWindow) + + rows, err := d.db.QueryContext(ctx, ` +SELECT + hostname, + channel_name, + event_id, + HOUR(ts) AS hour_of_day, + WEEKDAY(ts) AS day_of_week, + COUNT(*) AS cnt +FROM event_logs +WHERE ts >= ? AND ts < ? +GROUP BY hostname, channel_name, event_id, HOUR(ts), WEEKDAY(ts) +`, windowStart, windowEnd) + if err != nil { + return err + } + defer rows.Close() + + for rows.Next() { + var b BaselineBucket + if err := rows.Scan( + &b.Hostname, + &b.Channel, + &b.EventID, + &b.Hour, + &b.DayOfWeek, + &b.Count, + ); err != nil { + return err + } + + if err := d.updateBaselineBucket(ctx, b); err != nil { + return err + } + } + + return rows.Err() +} + +func (d *detector) updateBaselineBucket(ctx context.Context, b BaselineBucket) error { + tx, err := d.db.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + var stat BaselineStat + + err = tx.QueryRowContext(ctx, ` +SELECT avg_count, m2_count, stddev_count, sample_count +FROM baseline_event_stats +WHERE hostname = ? + AND channel_name = ? + AND event_id = ? + AND hour_of_day = ? + AND day_of_week = ? +FOR UPDATE +`, + b.Hostname, + b.Channel, + b.EventID, + b.Hour, + b.DayOfWeek, + ).Scan( + &stat.AvgCount, + &stat.M2Count, + &stat.StddevCount, + &stat.SampleCount, + ) + + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return err + } + + x := float64(b.Count) + + if errors.Is(err, sql.ErrNoRows) { + _, err := tx.ExecContext(ctx, ` +INSERT INTO baseline_event_stats +(hostname, channel_name, event_id, hour_of_day, day_of_week, + avg_count, m2_count, stddev_count, sample_count) +VALUES (?, ?, ?, ?, ?, ?, 0, 0, 1) +`, + b.Hostname, + b.Channel, + b.EventID, + b.Hour, + b.DayOfWeek, + x, + ) + if err != nil { + return err + } + + return tx.Commit() + } + + newSamples := stat.SampleCount + 1 + delta := x - stat.AvgCount + newAvg := stat.AvgCount + delta/float64(newSamples) + delta2 := x - newAvg + newM2 := stat.M2Count + delta*delta2 + + newStddev := 0.0 + if newSamples > 1 { + newStddev = math.Sqrt(newM2 / float64(newSamples-1)) + } + + _, err = tx.ExecContext(ctx, ` +UPDATE baseline_event_stats +SET avg_count = ?, + m2_count = ?, + stddev_count = ?, + sample_count = ?, + last_updated = CURRENT_TIMESTAMP(6) +WHERE hostname = ? + AND channel_name = ? + AND event_id = ? + AND hour_of_day = ? + AND day_of_week = ? +`, + newAvg, + newM2, + newStddev, + newSamples, + b.Hostname, + b.Channel, + b.EventID, + b.Hour, + b.DayOfWeek, + ) + if err != nil { + return err + } + + return tx.Commit() +} + +func (d *detector) runBaselineAnomalyRule(ctx context.Context) error { + if !d.cfg.BaselineEnabled { + return nil + } + + windowEnd := time.Now().UTC() + windowStart := windowEnd.Add(-d.cfg.BaselineWindow) + + rows, err := d.db.QueryContext(ctx, ` +SELECT + e.hostname, + e.channel_name, + e.event_id, + HOUR(e.ts) AS hour_of_day, + WEEKDAY(e.ts) AS day_of_week, + COUNT(*) AS cnt, + b.avg_count, + b.stddev_count, + b.sample_count +FROM event_logs e +JOIN baseline_event_stats b + ON b.hostname = e.hostname + AND b.channel_name = e.channel_name + AND b.event_id = e.event_id + AND b.hour_of_day = HOUR(e.ts) + AND b.day_of_week = WEEKDAY(e.ts) +WHERE e.ts >= ? AND e.ts < ? +GROUP BY + e.hostname, + e.channel_name, + e.event_id, + HOUR(e.ts), + WEEKDAY(e.ts), + b.avg_count, + b.stddev_count, + b.sample_count +`, windowStart, windowEnd) + if err != nil { + return err + } + defer rows.Close() + + for rows.Next() { + var host string + var channel string + var eventID uint32 + var hour int + var dayOfWeek int + var count int + var avg float64 + var stddev float64 + var samples int + + if err := rows.Scan( + &host, + &channel, + &eventID, + &hour, + &dayOfWeek, + &count, + &avg, + &stddev, + &samples, + ); err != nil { + return err + } + + eventIDStr := strconv.Itoa(int(eventID)) + + d.baselineCurrentCountGauge.WithLabelValues(host, channel, eventIDStr).Set(float64(count)) + d.baselineAverageGauge.WithLabelValues(host, channel, eventIDStr).Set(avg) + d.baselineStddevGauge.WithLabelValues(host, channel, eventIDStr).Set(stddev) + d.baselineSamplesGauge.WithLabelValues(host, channel, eventIDStr).Set(float64(samples)) + + if samples < d.cfg.BaselineMinSamples { + continue + } + + if count < d.cfg.BaselineMinCount { + continue + } + + if stddev <= 0 { + continue + } + + z := (float64(count) - avg) / stddev + + if z < d.cfg.BaselineMediumZScore { + continue + } + + severity := "medium" + if z >= d.cfg.BaselineHighZScore { + severity = "high" + } + + suppressed, err := d.isBaselineSuppressed(ctx, host, channel, eventID, windowEnd) + if err != nil { + return err + } + if suppressed { + continue + } + + score := z + + created, err := d.insertDetection(ctx, Detection{ + RuleName: "baseline_event_rate_anomaly", + Severity: severity, + Hostname: host, + Channel: channel, + EventID: eventID, + Score: score, + WindowStart: windowStart, + WindowEnd: windowEnd, + Summary: fmt.Sprintf( + "Baseline-Anomalie auf %s: %s EventID %d kam %d-mal in %d Minuten, normal %.2f ± %.2f, z=%.2f", + host, + channel, + eventID, + count, + int(d.cfg.BaselineWindow.Minutes()), + avg, + stddev, + z, + ), + Details: mustJSON(map[string]any{ + "hostname": host, + "channel": channel, + "event_id": eventID, + "count": count, + "avg_count": avg, + "stddev_count": stddev, + "z_score": z, + "sample_count": samples, + "hour_of_day": hour, + "day_of_week": dayOfWeek, + "window_minutes": int(d.cfg.BaselineWindow.Minutes()), + "min_samples": d.cfg.BaselineMinSamples, + "medium_z": d.cfg.BaselineMediumZScore, + "high_z": d.cfg.BaselineHighZScore, + }), + }) + if err != nil { + return err + } + + if created { + d.detectionHitsTotal.WithLabelValues("baseline_event_rate_anomaly", severity).Inc() + d.anomalyScoreGauge.WithLabelValues(host, "baseline_event_rate_anomaly").Set(score) + } + } + + return rows.Err() +} + +func (d *detector) isBaselineSuppressed(ctx context.Context, hostname, channel string, eventID uint32, now time.Time) (bool, error) { + if d.cfg.BaselineSuppressFor <= 0 { + return false, nil + } + + since := now.UTC().Add(-d.cfg.BaselineSuppressFor) + + var count int + err := d.db.QueryRowContext(ctx, ` +SELECT COUNT(*) +FROM detections +WHERE rule_name = 'baseline_event_rate_anomaly' + AND hostname = ? + AND channel_name = ? + AND event_id = ? + AND created_at >= ? +`, + hostname, + channel, + eventID, + since, + ).Scan(&count) + if err != nil { + return false, err + } + + return count > 0, nil +} + func (d *detector) runDynamicRules(ctx context.Context) error { rows, err := d.db.QueryContext(ctx, ` SELECT id, name, description, severity, channel, event_ids, @@ -2287,6 +2946,9 @@ func (s *server) runDetectionsOnce() { {"success_after_failures", s.detector.runSuccessAfterFailuresRule}, {"new_source_ip_for_user", s.detector.runNewSourceIPForUserRule}, {"dynamic_rules", s.detector.runDynamicRules}, + + {"baseline_anomaly", s.detector.runBaselineAnomalyRule}, + {"baseline_update", s.detector.runBaselineUpdate}, } for _, rule := range rules {