diff --git a/src/sentry/tasks/backfill_seer_grouping_records.py b/src/sentry/tasks/backfill_seer_grouping_records.py
index 2f7215e2cc048c..7e395dda74af4b 100644
--- a/src/sentry/tasks/backfill_seer_grouping_records.py
+++ b/src/sentry/tasks/backfill_seer_grouping_records.py
@@ -19,7 +19,6 @@
 from sentry.issues.grouptype import ErrorGroupType
 from sentry.issues.occurrence_consumer import EventLookupError
 from sentry.models.group import Group, GroupStatus
-from sentry.models.grouphash import GroupHash
 from sentry.models.project import Project
 from sentry.seer.similarity.backfill import (
     CreateGroupingRecordData,
@@ -221,12 +220,8 @@ def backfill_seer_grouping_records(
             group_id_batch.remove(group_id)
             del group_id_message_batch_filtered[group_id]
 
-    group_hashes = GroupHash.objects.filter(
-        project_id=project.id, group_id__in=group_id_batch
-    ).distinct("group_id")
-    group_hashes_dict = {group_hash.group_id: group_hash.hash for group_hash in group_hashes}
     data = lookup_group_data_stacktrace_bulk_with_fallback(
-        project, rows, group_id_message_batch_filtered, group_hashes_dict
+        project, rows, group_id_message_batch_filtered
     )
 
     # If nodestore returns no data
@@ -245,6 +240,11 @@ def backfill_seer_grouping_records(
         )
         return
 
+    group_hashes_dict = {
+        group_stacktrace_data["group_id"]: group_stacktrace_data["hash"]
+        for group_stacktrace_data in data["data"]
+    }
+
     with metrics.timer(f"{BACKFILL_NAME}.post_bulk_grouping_records", sample_rate=1.0):
         response = post_bulk_grouping_records(
             CreateGroupingRecordsRequest(
@@ -258,7 +258,7 @@ def backfill_seer_grouping_records(
         groups_with_neighbor = response["groups_with_neighbor"]
         groups = Group.objects.filter(project_id=project.id, id__in=group_id_batch)
         for group in groups:
-            seer_similarity = {
+            seer_similarity: dict[str, Any] = {
                 "similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
                 "request_hash": group_hashes_dict[group.id],
             }
@@ -332,19 +332,19 @@ def backfill_seer_grouping_records(
 
 
 def lookup_group_data_stacktrace_bulk_with_fallback(
-    project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
+    project: Project, rows: list[GroupEventRow], messages: dict[int, str]
 ) -> GroupStacktraceData:
     (
         bulk_event_ids,
         invalid_event_ids,
         bulk_group_data_stacktraces,
-    ) = lookup_group_data_stacktrace_bulk(project, rows, messages, hashes)
+    ) = lookup_group_data_stacktrace_bulk(project, rows, messages)
     for row in rows:
         event_id, group_id = row["event_id"], row["group_id"]
         if event_id not in bulk_event_ids and event_id not in invalid_event_ids:
             try:
                 group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-                    project, event_id, int(group_id), messages[group_id], hashes[group_id]
+                    project, event_id, int(group_id), messages[group_id]
                 )
                 if group_data and stacktrace_string:
                     bulk_group_data_stacktraces["data"].append(group_data)
@@ -374,7 +374,7 @@ def lookup_group_data_stacktrace_bulk_with_fallback(
 
 @metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
 def lookup_group_data_stacktrace_bulk(
-    project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
+    project: Project, rows: list[GroupEventRow], messages: dict[int, str]
 ) -> tuple[set[str], set[str], GroupStacktraceData]:
     project_id = project.id
     node_id_to_group_data = {
@@ -421,18 +421,23 @@ def lookup_group_data_stacktrace_bulk(
                 )
                 event = Event(event_id=event_id, project_id=project_id, group_id=group_id)
                 event.data = data
-                if event and event.data and event.data.get("exception") and hashes.get(group_id):
+                if event and event.data and event.data.get("exception"):
                     grouping_info = get_grouping_info(None, project=project, event=event)
                     stacktrace_string = get_stacktrace_string(grouping_info)
                     if stacktrace_string == "":
                         invalid_event_ids.add(event_id)
                         continue
+                    primary_hash = event.get_primary_hash()
+                    if not primary_hash:
+                        invalid_event_ids.add(event_id)
+                        continue
+
                     group_data.append(
                         CreateGroupingRecordData(
                             group_id=group_id,
                             project_id=project_id,
                             message=messages[group_id],
-                            hash=hashes[group_id],
+                            hash=primary_hash,
                         )
                     )
                     stacktrace_strings.append(stacktrace_string)
@@ -455,7 +460,7 @@ def lookup_group_data_stacktrace_bulk(
 
 @metrics.wraps(f"{BACKFILL_NAME}.lookup_event_single")
 def lookup_group_data_stacktrace_single(
-    project: Project, event_id: str, group_id: int, message: str, hash: str
+    project: Project, event_id: str, group_id: int, message: str
 ) -> tuple[CreateGroupingRecordData | None, str]:
     project_id = project.id
     try:
@@ -485,13 +490,20 @@ def lookup_group_data_stacktrace_single(
     with sentry_sdk.start_transaction(op="embeddings_grouping.get_latest_event"):
         grouping_info = get_grouping_info(None, project=project, event=event)
     stacktrace_string = get_stacktrace_string(grouping_info)
-    group_data = (
-        CreateGroupingRecordData(
-            group_id=group_id, hash=hash, project_id=project_id, message=message
+    primary_hash = event.get_primary_hash()
+    if not primary_hash:
+        group_data = None
+    else:
+        group_data = (
+            CreateGroupingRecordData(
+                group_id=group_id,
+                hash=primary_hash,
+                project_id=project_id,
+                message=message,
+            )
+            if stacktrace_string != ""
+            else None
         )
-        if stacktrace_string != ""
-        else None
-    )
 
     return (group_data, stacktrace_string)
 
diff --git a/tests/sentry/tasks/test_backfill_seer_grouping_records.py b/tests/sentry/tasks/test_backfill_seer_grouping_records.py
index ce6e51fbad327b..9037b110713d8f 100644
--- a/tests/sentry/tasks/test_backfill_seer_grouping_records.py
+++ b/tests/sentry/tasks/test_backfill_seer_grouping_records.py
@@ -165,7 +165,7 @@ def test_lookup_group_data_stacktrace_single_success(self):
         event = self.event
         hash = self.group_hashes[event.group.id]
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         expected_group_data = CreateGroupingRecordData(
             group_id=event.group.id,
@@ -200,7 +200,6 @@ def test_lookup_group_data_stacktrace_single_exceptions(
                     event.event_id,
                     event.group_id,
                     event.group.message,
-                    self.group_hashes[event.group.id],
                 )
             mock_logger.exception.assert_called_with(
                 "tasks.backfill_seer_grouping_records.event_lookup_exception",
@@ -221,18 +220,16 @@ def test_lookup_group_data_stacktrace_single_not_stacktrace_grouping(self):
             project_id=self.project.id,
             assert_no_errors=False,
         )
-        hash = GroupHash.objects.get(group_id=event.group.id)
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         assert (group_data, stacktrace_string) == (None, "")
 
     def test_lookup_group_data_stacktrace_single_no_stacktrace(self):
         """Test that no data is returned if the event has no stacktrace"""
         event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
-        hash = GroupHash.objects.get(group_id=event.group.id)
         group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-            self.project, event.event_id, event.group_id, event.group.message, hash
+            self.project, event.event_id, event.group_id, event.group.message
         )
         assert (group_data, stacktrace_string) == (None, "")
 
@@ -244,7 +241,7 @@ def test_lookup_group_data_stacktrace_bulk_success(self, mock_metrics):
             bulk_event_ids,
             invalid_event_ids,
             bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
+        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
 
         expected_event_ids = {event.event_id for event in events}
         expected_group_data = [
@@ -287,7 +284,7 @@ def test_lookup_group_data_stacktrace_bulk_exceptions(
         for exception in exceptions:
             mock_get_multi.side_effect = exception
             with pytest.raises(Exception):
-                lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
+                lookup_group_data_stacktrace_bulk(self.project, rows, messages)
             mock_logger.exception.assert_called_with(
                 "tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
                 extra={
@@ -323,7 +320,7 @@ def test_lookup_group_data_stacktrace_bulk_not_stacktrace_grouping(self):
             bulk_event_ids,
             invalid_event_ids,
             bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
+        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
         expected_group_data = [
             CreateGroupingRecordData(
                 group_id=event.group.id,
@@ -363,46 +360,7 @@ def test_lookup_group_data_stacktrace_bulk_no_stacktrace_exception(self):
             bulk_event_ids,
             invalid_event_ids,
             bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
-        expected_group_data = [
-            CreateGroupingRecordData(
-                group_id=event.group.id,
-                hash=hashes[event.group.id],
-                project_id=self.project.id,
-                message=event.group.message,
-            )
-            for event in events
-        ]
-        expected_stacktraces = [
-            f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
-            for i in range(2)
-        ]
-        assert bulk_event_ids == {event.event_id for event in events}
-        assert invalid_event_ids == {event.event_id}
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-
-    def test_lookup_group_data_stacktrace_bulk_no_hash(self):
-        """
-        Test that if a group does not have a hash (for whatever reason), its data is not included
-        in the bulk lookup result
-        """
-        # Use 2 events
-        rows, events, messages, hashes = self.bulk_rows[:2], self.bulk_events[:2], {}, {}
-        group_ids = [row["group_id"] for row in rows]
-        for group_id in group_ids:
-            messages.update({group_id: self.bulk_messages[group_id]})
-            hashes.update({group_id: self.group_hashes[group_id]})
-        # Create one event with no hash
-        event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
-        rows.append({"event_id": event.event_id, "group_id": event.group_id})
-        messages.update({event.group_id: event.group.message})
-
-        (
-            bulk_event_ids,
-            invalid_event_ids,
-            bulk_group_data_stacktraces,
-        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
+        ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
         expected_group_data = [
             CreateGroupingRecordData(
                 group_id=event.group.id,
@@ -430,7 +388,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_success(self):
             self.group_hashes,
         )
         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes
+            self.project, rows, messages
        )
 
         expected_group_data = [
@@ -480,7 +438,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(
 
         rows, messages, hashes = self.bulk_rows, self.bulk_messages, self.group_hashes
         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes=hashes
+            self.project, rows, messages
         )
 
         events = self.bulk_events
@@ -500,72 +458,6 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(
         assert bulk_group_data_stacktraces["data"] == expected_group_data
         assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
 
-    @patch("sentry.tasks.backfill_seer_grouping_records.logger")
-    @patch("sentry.tasks.backfill_seer_grouping_records.lookup_group_data_stacktrace_bulk")
-    def test_lookup_group_data_stacktrace_bulk_with_fallback_no_hash(
-        self, mock_lookup_group_data_stacktrace_bulk, mock_logger
-    ):
-        """
-        Test that if a group does not have a hash (for whatever reason), we do not attempt the
-        fallback and we log it
-        """
-        # Purposely exclude one event from being included in the bulk lookup response, so that the fallback is used
-        events_missing = self.bulk_events[:-1]
-        group_data, stacktrace_strings = [], []
-        for event in events_missing:
-            grouping_info = get_grouping_info(None, project=self.project, event=event)
-            stacktrace_string = get_stacktrace_string(grouping_info)
-            group_data.append(
-                CreateGroupingRecordData(
-                    group_id=event.group.id,
-                    hash=self.group_hashes[event.group.id],
-                    project_id=self.project.id,
-                    message=event.group.message,
-                )
-            )
-            stacktrace_strings.append(stacktrace_string)
-        mock_lookup_group_data_stacktrace_bulk.return_value = (
-            {event.event_id for event in events_missing},
-            set(),
-            GroupStacktraceData(data=group_data, stacktrace_list=stacktrace_strings),
-        )
-
-        # Purposely remove the hash for the missing event
-        hashes = copy.deepcopy(self.group_hashes)
-        del hashes[self.bulk_events[-1].group.id]
-
-        rows, messages = self.bulk_rows, self.bulk_messages
-        bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes=hashes
-        )
-
-        events = self.bulk_events[:-1]
-        expected_group_data = [
-            CreateGroupingRecordData(
-                group_id=event.group.id,
-                hash=hashes[event.group.id],
-                project_id=self.project.id,
-                message=event.group.message,
-            )
-            for event in events
-        ]
-        expected_stacktraces = [
-            f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
-            for i in range(4)
-        ]
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-        assert bulk_group_data_stacktraces["data"] == expected_group_data
-        assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
-        mock_logger.exception.assert_called_with(
-            "tasks.backfill_seer_grouping_records.no_group_hash",
-            extra={
-                "organization_id": self.project.organization.id,
-                "project_id": self.project.id,
-                "group_id": self.bulk_events[-1].group_id,
-            },
-        )
-
     @patch("sentry.tasks.backfill_seer_grouping_records.logger")
     def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self, mock_logger):
         """
@@ -581,7 +473,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self, mock_logger):
         rows[-1]["event_id"] = 10000
 
         bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
-            self.project, rows, messages, hashes
+            self.project, rows, messages
         )
 
         events = self.bulk_events[:-1]
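A minimal, self-contained sketch (not part of the patch) of the hash flow the change relies on: each record emitted by the nodestore lookup now carries the hash taken from `event.get_primary_hash()`, so the `group_id -> hash` mapping is rebuilt from those records instead of from a separate `GroupHash` query. `CreateGroupingRecordData` is re-declared locally here only so the snippet runs standalone:

```python
from typing import TypedDict


class CreateGroupingRecordData(TypedDict):
    # Mirrors the fields the backfill task sends to Seer.
    group_id: int
    project_id: int
    message: str
    hash: str


def build_group_hashes_dict(records: list[CreateGroupingRecordData]) -> dict[int, str]:
    # Same shape as the new group_hashes_dict comprehension in
    # backfill_seer_grouping_records: one hash per group, keyed by group id.
    return {record["group_id"]: record["hash"] for record in records}


# Hypothetical records, shaped like lookup_group_data_stacktrace_bulk output.
data = [
    CreateGroupingRecordData(group_id=1, project_id=7, message="Error1", hash="a" * 32),
    CreateGroupingRecordData(group_id=2, project_id=7, message="Error2", hash="b" * 32),
]
assert build_group_hashes_dict(data) == {1: "a" * 32, 2: "b" * 32}
```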