# Maximum number of characters kept from a tracing header value.
_MAX_TRACING_HEADER_LENGTH = 1000
# Matches C0 controls (\x00-\x1f), DEL (\x7f) and C1 controls (\x80-\x9f).
_TRACING_HEADER_CONTROL_CHARS_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")


def _sanitize_tracing_header_value(value: object) -> Optional[str]:
    """Return a safe tracing header value, or None if the value is invalid.

    Tracing headers come from user-controlled HTTP requests and are copied into
    event properties. Following the PostHog app's header sanitization, only
    strings are accepted, C0/C1 control characters are removed, surrounding
    whitespace is trimmed, the length is capped, and empty results are dropped.
    As one extra step, the tail is trimmed again after capping so that the
    truncation itself cannot leave trailing whitespace behind.

    Args:
        value: Raw header value. Anything that is not a non-empty ``str`` is
            rejected.

    Returns:
        The sanitized value (at most ``_MAX_TRACING_HEADER_LENGTH`` characters,
        without leading or trailing whitespace), or ``None`` when the input is
        not a string or nothing is left after sanitizing.
    """
    if not isinstance(value, str) or not value:
        return None

    cleaned = _TRACING_HEADER_CONTROL_CHARS_RE.sub("", value).strip()
    # The cut may land right after interior whitespace; trim the tail again so
    # the returned value never ends in whitespace.
    capped = cleaned[:_MAX_TRACING_HEADER_LENGTH].rstrip()
    return capped or None


def _get_sanitized_tracing_header(request, header_name: str) -> Optional[str]:
    """Read ``header_name`` from ``request.headers`` and sanitize its value.

    Args:
        request: Object exposing a ``headers`` mapping with a ``get`` method.
        header_name: Name of the header to look up.

    Returns:
        The sanitized header value, or ``None`` when the header is missing,
        invalid, or could not be read at all.
    """
    try:
        return _sanitize_tracing_header_value(request.headers.get(header_name))
    except Exception:
        # Deliberately best-effort: a request object without usable headers
        # must not break request handling, so the header is simply ignored.
        return None
@parameterized.expand(
    [
        (
            "session_control_chars",
            "X-POSTHOG-SESSION-ID",
            " session\n-\t123\x85 ",
            get_context_session_id,
            "session-123",
            None,
        ),
        (
            "distinct_empty_falls_back_to_user",
            "X-POSTHOG-DISTINCT-ID",
            "\r\n ",
            get_context_distinct_id,
            "42",
            42,
        ),
    ]
)
def test_extract_tags_sanitizes_tracing_header(
    self, _name, header_name, raw_value, get_context_value, expected_value, user_pk
):
    """Tracing header values are cleaned up before they reach the context.

    Each case sends one raw header value (optionally with an authenticated
    user attached) and checks the value the given context getter reports
    after ``extract_tags`` has run.
    """
    with new_context():
        mw = self.create_middleware()
        req = MockRequest(headers={header_name: raw_value}, method="GET")
        if user_pk is not None:
            # Attach an authenticated user with the given primary key.
            req.user = Mock(is_authenticated=True, pk=user_pk)

        mw.extract_tags(req)

        self.assertEqual(get_context_value(), expected_value)


@parameterized.expand(
    [
        (
            "session_non_string",
            "X-POSTHOG-SESSION-ID",
            123,
            get_context_session_id,
        ),
        (
            "distinct_non_string",
            "X-POSTHOG-DISTINCT-ID",
            object(),
            get_context_distinct_id,
        ),
    ]
)
def test_extract_tags_ignores_non_string_tracing_header(
    self, _name, header_name, raw_value, get_context_value
):
    """Non-string tracing header values are dropped and do not raise."""
    with new_context():
        mw = self.create_middleware()
        req = MockRequest(headers={header_name: raw_value}, method="GET")

        mw.extract_tags(req)

        self.assertIsNone(get_context_value())