From 3904df3988fc956380ac3ebc315d9a4f9c801196 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Tue, 26 Aug 2025 08:05:01 -0400 Subject: [PATCH 1/8] feat(logging): add datadog context --- warehouse/logging.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/warehouse/logging.py b/warehouse/logging.py index 9c383716cf62..7857e90dd9cc 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -36,6 +36,23 @@ def _create_id(request): return str(uuid.uuid4()) +def _add_datadog_context(logger, method_name, event_dict): + """Add Datadog trace context if available""" + try: + import ddtrace + span = ddtrace.tracer.current_span() + if span: + event_dict["dd.trace_id"] = str(span.trace_id) + event_dict["dd.span_id"] = str(span.span_id) + event_dict["dd.service"] = span.service + # deployment metadata + event_dict["dd.env"] = os.environ.get("DD_ENV", "development") + event_dict["dd.version"] = os.environ.get("DD_VERSION", "unknown") + except (ImportError, AttributeError): + pass + return event_dict + + def _create_logger(request): # This has to use **{} instead of just a kwarg because request.id is not # an allowed kwarg name. @@ -90,6 +107,7 @@ def includeme(config): structlog.stdlib.PositionalArgumentsFormatter(), structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, + _add_datadog_context, RENDERER, ], logger_factory=structlog.stdlib.LoggerFactory(), From 23a8bb8218d04b2bb17571a379e7aa91ef3b2353 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Tue, 26 Aug 2025 08:32:28 -0400 Subject: [PATCH 2/8] fix(logging): unify the stamps of time --- warehouse/logging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/warehouse/logging.py b/warehouse/logging.py index 7857e90dd9cc..f64abe68163c 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -105,6 +105,7 @@ def includeme(config): structlog.stdlib.add_logger_name, structlog.stdlib.add_log_level, structlog.stdlib.PositionalArgumentsFormatter(), + structlog.processors.TimeStamper(fmt="iso"), structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, _add_datadog_context, From 21b294562c860774703f66ebd45e408572c0031b Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Mon, 8 Sep 2025 14:15:02 -0500 Subject: [PATCH 3/8] feat(logging): gunicorn extra parsing --- tests/unit/test_logging.py | 74 ++++++++++++++++++++++++++++++++++++-- warehouse/logging.py | 63 ++++++++++++++++++++++++++++---- 2 files changed, 129 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_logging.py b/tests/unit/test_logging.py index 53ce3e1a6f3b..d68693ad0a16 100644 --- a/tests/unit/test_logging.py +++ b/tests/unit/test_logging.py @@ -37,6 +37,70 @@ def test_non_warehouse_logger_renders(self): "thread": threading.get_ident(), } + def test_gunicorn_access_log_parsing(self): + formatter = wlogging.StructlogFormatter() + access_log_line = ( + "bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355 - - " + '[11/Aug/2025:21:01:13 +0000] "GET /pypi/b5ee/json HTTP/1.1" 404 24 ' + '"-" "dependabot-core/0.325.1 excon/1.2.5 ruby/3.4.5 (x86_64-linux) ' + '(+https://github.com/dependabot/dependabot-core)"' + ) + record = logging.LogRecord( + "gunicorn.access", logging.INFO, None, None, access_log_line, None, None + ) + + result = json.loads(formatter.format(record)) + + assert result["logger"] == "gunicorn.access" + assert result["level"] == "INFO" + assert result["event"] == "http_request" + expected_id = "bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355" + assert result["request_id"] == expected_id + assert result["method"] == "GET" + assert result["path"] == "/pypi/b5ee/json" + assert result["protocol"] == "HTTP/1.1" + assert result["status"] == 404 + assert result["response_size"] == 24 + assert result["referrer"] is None + assert "dependabot-core" in result["user_agent"] + assert "thread" in result + + def test_gunicorn_access_log_parsing_with_referrer(self): + formatter = wlogging.StructlogFormatter() + access_log_line = ( + "test-id-123 - - " + '[12/Aug/2025:10:30:45 +0000] "POST /simple/upload HTTP/1.1" 200 500 ' + '"https://pypi.org/project/test/" "Mozilla/5.0 (compatible; test)"' + ) + record = logging.LogRecord( + "gunicorn.access", logging.INFO, None, None, access_log_line, None, None + ) + + result = json.loads(formatter.format(record)) + + assert result["request_id"] == "test-id-123" + assert result["method"] == "POST" + assert result["path"] == "/simple/upload" + assert result["status"] == 200 + assert result["response_size"] == 500 + assert result["referrer"] == "https://pypi.org/project/test/" + assert result["user_agent"] == "Mozilla/5.0 (compatible; test)" + + def test_gunicorn_access_log_unparsable(self): + formatter = wlogging.StructlogFormatter() + malformed_log = "this is not a valid access log format" + record = logging.LogRecord( + "gunicorn.access", logging.INFO, None, None, malformed_log, None, None + ) + + result = json.loads(formatter.format(record)) + + assert result["logger"] == "gunicorn.access" + assert result["level"] == "INFO" + assert result["event"] == "http_request_unparsed" + assert result["raw_message"] == "this is not a valid access log format" + assert "thread" in result + def test_create_id(monkeypatch): uuid4 = pretend.call_recorder(lambda: "a fake uuid") @@ -119,9 +183,11 @@ def test_includeme(monkeypatch, settings, expected_level): structlog.stdlib.filter_by_level, structlog.stdlib.add_logger_name, structlog.stdlib.add_log_level, - mock.ANY, - mock.ANY, + mock.ANY, # PositionalArgumentsFormatter + mock.ANY, # TimeStamper + mock.ANY, # StackInfoRenderer structlog.processors.format_exc_info, + mock.ANY, # _add_datadog_context wlogging.RENDERER, ], logger_factory=mock.ANY, @@ -135,6 +201,10 @@ def test_includeme(monkeypatch, settings, expected_level): ) assert isinstance( configure.calls[0].kwargs["processors"][4], + structlog.processors.TimeStamper, + ) + assert isinstance( + configure.calls[0].kwargs["processors"][5], structlog.processors.StackInfoRenderer, ) assert isinstance( diff --git a/warehouse/logging.py b/warehouse/logging.py index f64abe68163c..35c73f136940 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import logging.config +import os +import re import threading import uuid @@ -8,16 +10,64 @@ request_logger = structlog.get_logger("warehouse.request") -RENDERER = structlog.processors.JSONRenderer() +# Determine if we're in development mode +DEV_MODE = os.environ.get("WAREHOUSE_ENV") == "development" + +# Choose renderer based on environment +RENDERER: structlog.dev.ConsoleRenderer | structlog.processors.JSONRenderer +if DEV_MODE: + RENDERER = structlog.dev.ConsoleRenderer(colors=True) +else: + RENDERER = structlog.processors.JSONRenderer() class StructlogFormatter(logging.Formatter): + # Pattern to parse Gunicorn access logs + ACCESS_LOG_PATTERN = re.compile( + r"(?P[\w-]+) - - " + r"\[(?P[^\]]+)\] " + r'"(?P\w+) (?P[^\s]+) (?P[^"]+)" ' + r"(?P\d+) (?P\d+) " + r'"(?P[^"]*)" ' + r'"(?P[^"]*)"' + ) + def format(self, record): - # TODO: Figure out a better way of handling this besides just looking - # at the logger name, ideally this would have some way to - # really differentiate between log items which were logged by - # structlog and which were not. - if not record.name.startswith("warehouse."): + # Handle Gunicorn access logs with structured parsing + if record.name == "gunicorn.access": + match = self.ACCESS_LOG_PATTERN.match(record.msg) + if match: + event_dict = { + "logger": record.name, + "level": record.levelname, + "event": "http_request", + "request_id": match.group("request_id"), + "method": match.group("method"), + "path": match.group("path"), + "protocol": match.group("protocol"), + "status": int(match.group("status")), + "response_size": int(match.group("size")), + "referrer": ( + match.group("referrer") + if match.group("referrer") != "-" + else None + ), + "user_agent": match.group("user_agent"), + "thread": threading.get_ident(), + } + record.msg = RENDERER(None, record.levelname, event_dict) + else: + # Fallback for access logs that don't match the expected format + event_dict = { + "logger": record.name, + "level": record.levelname, + "event": "http_request_unparsed", + "raw_message": record.msg, + "thread": threading.get_ident(), + } + record.msg = RENDERER(None, record.levelname, event_dict) + # Handle other non-warehouse logs + elif not record.name.startswith("warehouse."): # TODO: Is there a better way to handle this? Maybe we can figure # out a way to pass this through the structlog processors # instead of manually duplicating the side effects here? @@ -40,6 +90,7 @@ def _add_datadog_context(logger, method_name, event_dict): """Add Datadog trace context if available""" try: import ddtrace + span = ddtrace.tracer.current_span() if span: event_dict["dd.trace_id"] = str(span.trace_id) From a7acddd2d333bab8a4651d1c2341cef6aa726c8a Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Wed, 24 Sep 2025 13:54:45 -0500 Subject: [PATCH 4/8] feat(logging): celery structured --- warehouse/logging.py | 33 +++++++++++++++++++++++++++++++++ warehouse/tasks.py | 20 ++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/warehouse/logging.py b/warehouse/logging.py index 35c73f136940..c374aaab4eeb 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -104,6 +104,39 @@ def _add_datadog_context(logger, method_name, event_dict): return event_dict +def configure_celery_logging(logfile: str | None = None, loglevel: int = logging.INFO): + """Configure unified structlog logging for Celery that handles all log types.""" + processors = [ + structlog.contextvars.merge_contextvars, + structlog.processors.TimeStamper(fmt="iso"), + structlog.stdlib.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + _add_datadog_context, + ] + formatter = structlog.stdlib.ProcessorFormatter( + processor=RENDERER, + foreign_pre_chain=processors, + ) + + handler = logging.FileHandler(logfile) if logfile else logging.StreamHandler() + handler.setFormatter(formatter) + + root = logging.getLogger() + root.handlers.clear() + root.addHandler(handler) + root.setLevel(loglevel) + + structlog.configure( + processors=processors + [ + structlog.stdlib.ProcessorFormatter.wrap_for_formatter, + ], + logger_factory=structlog.stdlib.LoggerFactory(), + wrapper_class=structlog.make_filtering_bound_logger(logging.INFO), + cache_logger_on_first_use=True, + ) + + def _create_logger(request): # This has to use **{} instead of just a kwarg because request.id is not # an allowed kwarg name. diff --git a/warehouse/tasks.py b/warehouse/tasks.py index bad2e6e51bc9..f1ac20a045a4 100644 --- a/warehouse/tasks.py +++ b/warehouse/tasks.py @@ -14,8 +14,10 @@ import celery.backends.redis import pyramid.scripting import pyramid_retry +import structlog import transaction import venusian +from celery import signals from kombu import Queue from pyramid.threadlocal import get_current_request @@ -36,6 +38,20 @@ logger = logging.getLogger(__name__) +# Celery signal handlers for unified structlog configuration +@signals.after_setup_logger.connect +def on_after_setup_logger(logger, loglevel, logfile, *args, **kwargs): + """Override Celery's default logging behavior with unified structlog configuration.""" + from warehouse.logging import configure_celery_logging + configure_celery_logging(logfile, loglevel) + + +@signals.task_prerun.connect +def on_task_prerun(sender, task_id, task, **_): + """Bind task metadata to contextvars for all logs within the task.""" + structlog.contextvars.bind_contextvars(task_id=task_id, task_name=task.name) + + class TLSRedisBackend(celery.backends.redis.RedisBackend): def _params_from_url(self, url, defaults): params = super()._params_from_url(url, defaults) @@ -301,6 +317,10 @@ def includeme(config): REDBEAT_REDIS_URL=s["celery.scheduler_url"], # Silence deprecation warning on startup broker_connection_retry_on_startup=False, + # Disable Celery's logger hijacking for unified structlog control + worker_hijack_root_logger=False, + worker_log_format="%(message)s", + worker_task_log_format="%(message)s", ) config.registry["celery.app"].Task = WarehouseTask config.registry["celery.app"].pyramid_config = config From 87d6d236bf7cba6c0413c6aa04a65864828c38bb Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Wed, 24 Sep 2025 16:29:25 -0500 Subject: [PATCH 5/8] fix: gunicorn --- warehouse/logging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warehouse/logging.py b/warehouse/logging.py index c374aaab4eeb..a0567ff32c0c 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -24,7 +24,7 @@ class StructlogFormatter(logging.Formatter): # Pattern to parse Gunicorn access logs ACCESS_LOG_PATTERN = re.compile( - r"(?P[\w-]+) - - " + r"(?P[\d\.]+) - - " r"\[(?P[^\]]+)\] " r'"(?P\w+) (?P[^\s]+) (?P[^"]+)" ' r"(?P\d+) (?P\d+) " @@ -41,7 +41,7 @@ def format(self, record): "logger": record.name, "level": record.levelname, "event": "http_request", - "request_id": match.group("request_id"), + "remote_addr": match.group("remote_addr"), "method": match.group("method"), "path": match.group("path"), "protocol": match.group("protocol"), From 44854a047965a846ffc350fb8b08422af9785d4b Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Wed, 24 Sep 2025 16:46:17 -0500 Subject: [PATCH 6/8] fix: gunicorn but for real --- warehouse/logging.py | 121 +++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 62 deletions(-) diff --git a/warehouse/logging.py b/warehouse/logging.py index a0567ff32c0c..01d76b42b35d 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -21,65 +21,6 @@ RENDERER = structlog.processors.JSONRenderer() -class StructlogFormatter(logging.Formatter): - # Pattern to parse Gunicorn access logs - ACCESS_LOG_PATTERN = re.compile( - r"(?P[\d\.]+) - - " - r"\[(?P[^\]]+)\] " - r'"(?P\w+) (?P[^\s]+) (?P[^"]+)" ' - r"(?P\d+) (?P\d+) " - r'"(?P[^"]*)" ' - r'"(?P[^"]*)"' - ) - - def format(self, record): - # Handle Gunicorn access logs with structured parsing - if record.name == "gunicorn.access": - match = self.ACCESS_LOG_PATTERN.match(record.msg) - if match: - event_dict = { - "logger": record.name, - "level": record.levelname, - "event": "http_request", - "remote_addr": match.group("remote_addr"), - "method": match.group("method"), - "path": match.group("path"), - "protocol": match.group("protocol"), - "status": int(match.group("status")), - "response_size": int(match.group("size")), - "referrer": ( - match.group("referrer") - if match.group("referrer") != "-" - else None - ), - "user_agent": match.group("user_agent"), - "thread": threading.get_ident(), - } - record.msg = RENDERER(None, record.levelname, event_dict) - else: - # Fallback for access logs that don't match the expected format - event_dict = { - "logger": record.name, - "level": record.levelname, - "event": "http_request_unparsed", - "raw_message": record.msg, - "thread": threading.get_ident(), - } - record.msg = RENDERER(None, record.levelname, event_dict) - # Handle other non-warehouse logs - elif not record.name.startswith("warehouse."): - # TODO: Is there a better way to handle this? Maybe we can figure - # out a way to pass this through the structlog processors - # instead of manually duplicating the side effects here? - event_dict = { - "logger": record.name, - "level": record.levelname, - "event": record.msg, - "thread": threading.get_ident(), - } - record.msg = RENDERER(None, record.levelname, event_dict) - - return super().format(record) def _create_id(request): @@ -104,6 +45,47 @@ def _add_datadog_context(logger, method_name, event_dict): return event_dict +def _parse_gunicorn_access_log(logger, method_name, event_dict): + """Parse Gunicorn logs into structlog ((only access logs).""" + if event_dict.get("logger") != "gunicorn.access": + return event_dict + + message = event_dict.get("event", "") + + # based on https://albersdevelopment.net/2019/08/15/using-structlog-with-gunicorn/ and friends + # Combined log format: host - user [time] "request" status size "referer" "user-agent" + pattern = re.compile( + r'(?P\S+) \S+ (?P\S+) ' + r'\[(?P.+?)\] "(?P.+?)" ' + r'(?P\d+) (?P\S+) ' + r'"(?P.*?)" "(?P.*?)"' + ) + + match = pattern.match(message) + if not match: + return event_dict + + fields = match.groupdict() + + # sanitize + fields["user"] = None if fields["user"] == "-" else fields["user"] + fields["status"] = int(fields["status"]) + fields["size"] = 0 if fields["size"] == "-" else int(fields["size"]) + fields["referrer"] = None if fields["referrer"] == "-" else fields["referrer"] + + # Parse "GET /path HTTP/1.1" into separate fields + request_parts = fields["request"].split(" ", 2) + if len(request_parts) >= 2: + fields["method"] = request_parts[0] + fields["path"] = request_parts[1] + if len(request_parts) == 3: + fields["protocol"] = request_parts[2] + + event_dict.update(fields) + event_dict["event"] = "http_request" + return event_dict + + def configure_celery_logging(logfile: str | None = None, loglevel: int = logging.INFO): """Configure unified structlog logging for Celery that handles all log types.""" processors = [ @@ -144,17 +126,32 @@ def _create_logger(request): def includeme(config): + # non structlog thigns + foreign_pre_chain = [ + structlog.stdlib.add_log_level, + structlog.stdlib.add_logger_name, + structlog.processors.TimeStamper(fmt="iso"), + _add_datadog_context, + _parse_gunicorn_access_log, + ] + # Configure the standard library logging logging.config.dictConfig( { "version": 1, "disable_existing_loggers": False, - "formatters": {"structlog": {"()": "warehouse.logging.StructlogFormatter"}}, + "formatters": { + "structlog_formatter": { + "()": structlog.stdlib.ProcessorFormatter, + "processor": RENDERER, + "foreign_pre_chain": foreign_pre_chain, + } + }, "handlers": { "primary": { "class": "logging.StreamHandler", "stream": "ext://sys.stdout", - "formatter": "structlog", + "formatter": "structlog_formatter", }, }, "loggers": { @@ -193,7 +190,7 @@ def includeme(config): structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, _add_datadog_context, - RENDERER, + structlog.stdlib.ProcessorFormatter.wrap_for_formatter, ], logger_factory=structlog.stdlib.LoggerFactory(), wrapper_class=structlog.stdlib.BoundLogger, From 5b322dcadd115d2693632834bbf5972b1bfe77a3 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Wed, 24 Sep 2025 16:49:38 -0500 Subject: [PATCH 7/8] ci: lint --- warehouse/logging.py | 17 +++++++++-------- warehouse/tasks.py | 7 +++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/warehouse/logging.py b/warehouse/logging.py index 01d76b42b35d..21937cfba340 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -3,7 +3,6 @@ import logging.config import os import re -import threading import uuid import structlog @@ -21,8 +20,6 @@ RENDERER = structlog.processors.JSONRenderer() - - def _create_id(request): return str(uuid.uuid4()) @@ -52,12 +49,15 @@ def _parse_gunicorn_access_log(logger, method_name, event_dict): message = event_dict.get("event", "") - # based on https://albersdevelopment.net/2019/08/15/using-structlog-with-gunicorn/ and friends - # Combined log format: host - user [time] "request" status size "referer" "user-agent" + # based on + # https://albersdevelopment.net/2019/08/15/using-structlog-with-gunicorn/ + # and friends + # Combined log format: + # host - user [time] "request" status size "referer" "user-agent" pattern = re.compile( - r'(?P\S+) \S+ (?P\S+) ' + r"(?P\S+) \S+ (?P\S+) " r'\[(?P.+?)\] "(?P.+?)" ' - r'(?P\d+) (?P\S+) ' + r"(?P\d+) (?P\S+) " r'"(?P.*?)" "(?P.*?)"' ) @@ -110,7 +110,8 @@ def configure_celery_logging(logfile: str | None = None, loglevel: int = logging root.setLevel(loglevel) structlog.configure( - processors=processors + [ + processors=processors + + [ structlog.stdlib.ProcessorFormatter.wrap_for_formatter, ], logger_factory=structlog.stdlib.LoggerFactory(), diff --git a/warehouse/tasks.py b/warehouse/tasks.py index f1ac20a045a4..b2929b8ada95 100644 --- a/warehouse/tasks.py +++ b/warehouse/tasks.py @@ -17,8 +17,8 @@ import structlog import transaction import venusian -from celery import signals +from celery import signals from kombu import Queue from pyramid.threadlocal import get_current_request @@ -39,10 +39,13 @@ # Celery signal handlers for unified structlog configuration +# https://github.com/hynek/structlog/issues/287 +# https://www.structlog.org/en/stable/frameworks.html#celery @signals.after_setup_logger.connect def on_after_setup_logger(logger, loglevel, logfile, *args, **kwargs): - """Override Celery's default logging behavior with unified structlog configuration.""" + """Override Celery's default logging behavior w/ unified structlog configuration.""" from warehouse.logging import configure_celery_logging + configure_celery_logging(logfile, loglevel) From 56d6a02bb2be33c9d3c7bca25dba06fb638e8d7d Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Thu, 25 Sep 2025 09:34:32 -0500 Subject: [PATCH 8/8] ai: tests, linting --- tests/unit/test_logging.py | 105 +++++++++++++++---------------------- warehouse/logging.py | 4 +- 2 files changed, 44 insertions(+), 65 deletions(-) diff --git a/tests/unit/test_logging.py b/tests/unit/test_logging.py index d68693ad0a16..7b4e970bf4cb 100644 --- a/tests/unit/test_logging.py +++ b/tests/unit/test_logging.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -import json import logging import logging.config -import threading import uuid from unittest import mock @@ -15,91 +13,68 @@ from warehouse import logging as wlogging -class TestStructlogFormatter: - def test_warehouse_logger_no_renderer(self): - formatter = wlogging.StructlogFormatter() - record = logging.LogRecord( - "warehouse.request", logging.INFO, None, None, "the message", None, None - ) - - assert formatter.format(record) == "the message" - - def test_non_warehouse_logger_renders(self): - formatter = wlogging.StructlogFormatter() - record = logging.LogRecord( - "another.logger", logging.INFO, None, None, "the message", None, None - ) - - assert json.loads(formatter.format(record)) == { - "logger": "another.logger", - "level": "INFO", - "event": "the message", - "thread": threading.get_ident(), - } - - def test_gunicorn_access_log_parsing(self): - formatter = wlogging.StructlogFormatter() +class TestGunicornAccessLogParsing: + def test_parse_gunicorn_access_log_success(self): access_log_line = ( - "bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355 - - " + "192.168.1.1 - - " '[11/Aug/2025:21:01:13 +0000] "GET /pypi/b5ee/json HTTP/1.1" 404 24 ' - '"-" "dependabot-core/0.325.1 excon/1.2.5 ruby/3.4.5 (x86_64-linux) ' - '(+https://github.com/dependabot/dependabot-core)"' - ) - record = logging.LogRecord( - "gunicorn.access", logging.INFO, None, None, access_log_line, None, None + '"-" "dependabot-core/0.325.1 excon/1.2.5 ruby/3.4.5 (x86_64-linux)"' ) - result = json.loads(formatter.format(record)) + event_dict = {"logger": "gunicorn.access", "event": access_log_line} + + result = wlogging._parse_gunicorn_access_log(None, None, event_dict) - assert result["logger"] == "gunicorn.access" - assert result["level"] == "INFO" assert result["event"] == "http_request" - expected_id = "bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355" - assert result["request_id"] == expected_id + assert result["remote_addr"] == "192.168.1.1" + assert result["user"] is None + assert result["timestamp"] == "11/Aug/2025:21:01:13 +0000" + assert result["request"] == "GET /pypi/b5ee/json HTTP/1.1" assert result["method"] == "GET" assert result["path"] == "/pypi/b5ee/json" assert result["protocol"] == "HTTP/1.1" assert result["status"] == 404 - assert result["response_size"] == 24 + assert result["size"] == 24 assert result["referrer"] is None assert "dependabot-core" in result["user_agent"] - assert "thread" in result - def test_gunicorn_access_log_parsing_with_referrer(self): - formatter = wlogging.StructlogFormatter() + def test_parse_gunicorn_access_log_with_referrer(self): access_log_line = ( - "test-id-123 - - " + "192.168.1.1 - - " '[12/Aug/2025:10:30:45 +0000] "POST /simple/upload HTTP/1.1" 200 500 ' '"https://pypi.org/project/test/" "Mozilla/5.0 (compatible; test)"' ) - record = logging.LogRecord( - "gunicorn.access", logging.INFO, None, None, access_log_line, None, None - ) - result = json.loads(formatter.format(record)) + event_dict = {"logger": "gunicorn.access", "event": access_log_line} - assert result["request_id"] == "test-id-123" + result = wlogging._parse_gunicorn_access_log(None, None, event_dict) + + assert result["remote_addr"] == "192.168.1.1" assert result["method"] == "POST" assert result["path"] == "/simple/upload" assert result["status"] == 200 - assert result["response_size"] == 500 + assert result["size"] == 500 assert result["referrer"] == "https://pypi.org/project/test/" assert result["user_agent"] == "Mozilla/5.0 (compatible; test)" - def test_gunicorn_access_log_unparsable(self): - formatter = wlogging.StructlogFormatter() - malformed_log = "this is not a valid access log format" - record = logging.LogRecord( - "gunicorn.access", logging.INFO, None, None, malformed_log, None, None - ) + def test_parse_gunicorn_access_log_unparsable(self): + event_dict = { + "logger": "gunicorn.access", + "event": "this is not a valid access log format", + } + + result = wlogging._parse_gunicorn_access_log(None, None, event_dict) + + # Should return unchanged if unparsable + assert result["event"] == "this is not a valid access log format" + + def test_parse_gunicorn_access_log_non_access_log(self): + event_dict = {"logger": "some.other.logger", "event": "Some message"} - result = json.loads(formatter.format(record)) + result = wlogging._parse_gunicorn_access_log(None, None, event_dict) - assert result["logger"] == "gunicorn.access" - assert result["level"] == "INFO" - assert result["event"] == "http_request_unparsed" - assert result["raw_message"] == "this is not a valid access log format" - assert "thread" in result + # Should return unchanged for non-access logs + assert result == event_dict def test_create_id(monkeypatch): @@ -146,13 +121,17 @@ def test_includeme(monkeypatch, settings, expected_level): "version": 1, "disable_existing_loggers": False, "formatters": { - "structlog": {"()": "warehouse.logging.StructlogFormatter"} + "structlog_formatter": { + "()": structlog.stdlib.ProcessorFormatter, + "processor": mock.ANY, + "foreign_pre_chain": mock.ANY, + } }, "handlers": { "primary": { "class": "logging.StreamHandler", "stream": "ext://sys.stdout", - "formatter": "structlog", + "formatter": "structlog_formatter", }, }, "loggers": { @@ -188,7 +167,7 @@ def test_includeme(monkeypatch, settings, expected_level): mock.ANY, # StackInfoRenderer structlog.processors.format_exc_info, mock.ANY, # _add_datadog_context - wlogging.RENDERER, + structlog.stdlib.ProcessorFormatter.wrap_for_formatter, ], logger_factory=mock.ANY, wrapper_class=structlog.stdlib.BoundLogger, diff --git a/warehouse/logging.py b/warehouse/logging.py index 21937cfba340..1b54df360b30 100644 --- a/warehouse/logging.py +++ b/warehouse/logging.py @@ -88,7 +88,7 @@ def _parse_gunicorn_access_log(logger, method_name, event_dict): def configure_celery_logging(logfile: str | None = None, loglevel: int = logging.INFO): """Configure unified structlog logging for Celery that handles all log types.""" - processors = [ + processors: list = [ structlog.contextvars.merge_contextvars, structlog.processors.TimeStamper(fmt="iso"), structlog.stdlib.add_log_level, @@ -128,7 +128,7 @@ def _create_logger(request): def includeme(config): # non structlog thigns - foreign_pre_chain = [ + foreign_pre_chain: list = [ structlog.stdlib.add_log_level, structlog.stdlib.add_logger_name, structlog.processors.TimeStamper(fmt="iso"),