diff --git a/server/lib/sanitize.ts b/server/lib/sanitize.ts new file mode 100644 index 000000000..9eba8a583 --- /dev/null +++ b/server/lib/sanitize.ts @@ -0,0 +1,40 @@ +/** + * Sanitize a string field before inserting into a database TEXT column. + * + * Two passes are applied: + * + * 1. Lone UTF-16 surrogates – JavaScript strings can hold unpaired surrogates + * (e.g. \uD800 without a following \uDC00-\uDFFF codepoint). These are + * valid in JS but cannot be encoded as UTF-8, triggering + * `report_invalid_encoding` in SQLite / Postgres. They are replaced with + * the Unicode replacement character U+FFFD so the data is preserved as a + * visible signal that something was malformed. + * + * 2. Null bytes and C0 control characters – SQLite stores TEXT as + * null-terminated C strings, so \x00 in a value causes + * `report_invalid_encoding`. Bots and scanners routinely inject null bytes + * into URLs (e.g. `/path\u0000.jpg`). All C0 control characters in the + * range \x00-\x1F are stripped except for the three that are legitimate in + * text payloads: HT (\x09), LF (\x0A), and CR (\x0D). DEL (\x7F) is also + * stripped. + */ +export function sanitizeString(value: string): string; +export function sanitizeString( + value: string | null | undefined +): string | undefined; +export function sanitizeString( + value: string | null | undefined +): string | undefined { + if (value == null) return undefined; + return ( + value + // Replace lone high surrogates (not followed by a low surrogate) + // and lone low surrogates (not preceded by a high surrogate). + .replace( + /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(? ({ timestamp: logEntry.timestamp, - orgId: logEntry.orgId, - actorType: logEntry.actorType, - actor: logEntry.actor, - actorId: logEntry.actorId, - metadata: logEntry.metadata, + orgId: sanitizeString(logEntry.orgId), + actorType: sanitizeString(logEntry.actorType), + actor: sanitizeString(logEntry.actor), + actorId: sanitizeString(logEntry.actorId), + metadata: sanitizeString(logEntry.metadata), action: logEntry.action, resourceId: logEntry.resourceId, reason: logEntry.reason, - location: logEntry.location, + location: sanitizeString(logEntry.location), // userAgent: data.userAgent, // TODO: add this // headers: data.body.headers, // query: data.body.query, - originalRequestURL: logEntry.originalRequestURL, - scheme: logEntry.scheme, - host: logEntry.host, - path: logEntry.path, - method: logEntry.method, - ip: logEntry.ip, + originalRequestURL: sanitizeString(logEntry.originalRequestURL) ?? "", + scheme: sanitizeString(logEntry.scheme) ?? "", + host: sanitizeString(logEntry.host) ?? "", + path: sanitizeString(logEntry.path) ?? "", + method: sanitizeString(logEntry.method) ?? "", + ip: sanitizeString(logEntry.ip), tls: logEntry.tls })); diff --git a/server/routers/badger/logRequestAudit.ts b/server/routers/badger/logRequestAudit.ts index 4dd5bff99..92d01332e 100644 --- a/server/routers/badger/logRequestAudit.ts +++ b/server/routers/badger/logRequestAudit.ts @@ -5,25 +5,7 @@ import cache from "#dynamic/lib/cache"; import { calculateCutoffTimestamp } from "@server/lib/cleanupLogs"; import { stripPortFromHost } from "@server/lib/ip"; -/** - * Sanitize a string field by replacing lone UTF-16 surrogates (which cannot - * be encoded as valid UTF-8) with the Unicode replacement character, and - * stripping ASCII control characters that are invalid in most text columns. - */ -function sanitizeString(value: string | undefined | null): string | undefined { - if (value == null) return undefined; - return ( - value - // Replace lone high surrogates (not followed by a low surrogate) - // and lone low surrogates (not preceded by a high surrogate) - .replace( - /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?