52 changes: 32 additions & 20 deletions src/sentry/tasks/backfill_seer_grouping_records.py
@@ -19,7 +19,6 @@
from sentry.issues.grouptype import ErrorGroupType
from sentry.issues.occurrence_consumer import EventLookupError
from sentry.models.group import Group, GroupStatus
from sentry.models.grouphash import GroupHash
from sentry.models.project import Project
from sentry.seer.similarity.backfill import (
CreateGroupingRecordData,
@@ -221,12 +220,8 @@ def backfill_seer_grouping_records(
group_id_batch.remove(group_id)
del group_id_message_batch_filtered[group_id]

group_hashes = GroupHash.objects.filter(
project_id=project.id, group_id__in=group_id_batch
).distinct("group_id")
group_hashes_dict = {group_hash.group_id: group_hash.hash for group_hash in group_hashes}
data = lookup_group_data_stacktrace_bulk_with_fallback(
project, rows, group_id_message_batch_filtered, group_hashes_dict
project, rows, group_id_message_batch_filtered
)

# If nodestore returns no data
@@ -245,6 +240,11 @@
)
return

group_hashes_dict = {
group_stacktrace_data["group_id"]: group_stacktrace_data["hash"]
for group_stacktrace_data in data["data"]
}

with metrics.timer(f"{BACKFILL_NAME}.post_bulk_grouping_records", sample_rate=1.0):
response = post_bulk_grouping_records(
CreateGroupingRecordsRequest(
@@ -258,7 +258,7 @@
groups_with_neighbor = response["groups_with_neighbor"]
groups = Group.objects.filter(project_id=project.id, id__in=group_id_batch)
for group in groups:
seer_similarity = {
seer_similarity: dict[str, Any] = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": group_hashes_dict[group.id],
}
Expand Down Expand Up @@ -332,19 +332,19 @@ def backfill_seer_grouping_records(


def lookup_group_data_stacktrace_bulk_with_fallback(
project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
project: Project, rows: list[GroupEventRow], messages: dict[int, str]
) -> GroupStacktraceData:
(
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(project, rows, messages)
for row in rows:
event_id, group_id = row["event_id"], row["group_id"]
if event_id not in bulk_event_ids and event_id not in invalid_event_ids:
try:
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
project, event_id, int(group_id), messages[group_id], hashes[group_id]
project, event_id, int(group_id), messages[group_id]
)
if group_data and stacktrace_string:
bulk_group_data_stacktraces["data"].append(group_data)
@@ -374,7 +374,7 @@ def lookup_group_data_stacktrace_bulk_with_fallback(

@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
def lookup_group_data_stacktrace_bulk(
project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
project: Project, rows: list[GroupEventRow], messages: dict[int, str]
) -> tuple[set[str], set[str], GroupStacktraceData]:
project_id = project.id
node_id_to_group_data = {
@@ -421,18 +421,23 @@ def lookup_group_data_stacktrace_bulk(
)
event = Event(event_id=event_id, project_id=project_id, group_id=group_id)
event.data = data
if event and event.data and event.data.get("exception") and hashes.get(group_id):
if event and event.data and event.data.get("exception"):
grouping_info = get_grouping_info(None, project=project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
if stacktrace_string == "":
invalid_event_ids.add(event_id)
continue
primary_hash = event.get_primary_hash()
if not primary_hash:
invalid_event_ids.add(event_id)
continue

group_data.append(
CreateGroupingRecordData(
group_id=group_id,
project_id=project_id,
message=messages[group_id],
hash=hashes[group_id],
hash=primary_hash,
)
)
stacktrace_strings.append(stacktrace_string)
@@ -455,7 +460,7 @@

@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_single")
def lookup_group_data_stacktrace_single(
project: Project, event_id: str, group_id: int, message: str, hash: str
project: Project, event_id: str, group_id: int, message: str
) -> tuple[CreateGroupingRecordData | None, str]:
project_id = project.id
try:
@@ -485,13 +490,20 @@ def lookup_group_data_stacktrace_single(
with sentry_sdk.start_transaction(op="embeddings_grouping.get_latest_event"):
grouping_info = get_grouping_info(None, project=project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
group_data = (
CreateGroupingRecordData(
group_id=group_id, hash=hash, project_id=project_id, message=message
primary_hash = event.get_primary_hash()
if not primary_hash:
group_data = None
else:
group_data = (
CreateGroupingRecordData(
group_id=group_id,
hash=primary_hash,
project_id=project_id,
message=message,
)
if stacktrace_string != ""
else None
)
if stacktrace_string != ""
else None
)

return (group_data, stacktrace_string)

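Taken together, the task no longer queries GroupHash up front: each record's hash now comes from event.get_primary_hash() during the nodestore lookup, and the group_id -> hash mapping is rebuilt from the lookup result. A minimal, self-contained sketch of that mapping step follows; GroupingRecord is a stand-in for CreateGroupingRecordData and build_group_hashes_dict mirrors the dict comprehension added above (both names are illustrative, not part of this diff).

from typing import TypedDict


class GroupingRecord(TypedDict):
    # Stand-in for CreateGroupingRecordData, for illustration only.
    group_id: int
    project_id: int
    message: str
    hash: str


def build_group_hashes_dict(records: list[GroupingRecord]) -> dict[int, str]:
    # Rebuild the group_id -> hash mapping from the records returned by the
    # bulk lookup, instead of a separate GroupHash query.
    return {record["group_id"]: record["hash"] for record in records}


# Example: two records produce a two-entry mapping.
records = [
    GroupingRecord(group_id=1, project_id=10, message="Error1", hash="a" * 32),
    GroupingRecord(group_id=2, project_id=10, message="Error2", hash="b" * 32),
]
assert build_group_hashes_dict(records) == {1: "a" * 32, 2: "b" * 32}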
128 changes: 10 additions & 118 deletions tests/sentry/tasks/test_backfill_seer_grouping_records.py
@@ -165,7 +165,7 @@ def test_lookup_group_data_stacktrace_single_success(self):
event = self.event
hash = self.group_hashes[event.group.id]
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
expected_group_data = CreateGroupingRecordData(
group_id=event.group.id,
@@ -200,7 +200,6 @@ def test_lookup_group_data_stacktrace_single_exceptions(
event.event_id,
event.group_id,
event.group.message,
self.group_hashes[event.group.id],
)
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.event_lookup_exception",
@@ -221,18 +220,16 @@ def test_lookup_group_data_stacktrace_single_not_stacktrace_grouping(self):
project_id=self.project.id,
assert_no_errors=False,
)
hash = GroupHash.objects.get(group_id=event.group.id)
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
assert (group_data, stacktrace_string) == (None, "")

def test_lookup_group_data_stacktrace_single_no_stacktrace(self):
"""Test that no data is returned if the event has no stacktrace"""
event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
hash = GroupHash.objects.get(group_id=event.group.id)
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
assert (group_data, stacktrace_string) == (None, "")

@@ -244,7 +241,7 @@ def test_lookup_group_data_stacktrace_bulk_success(self, mock_metrics):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)

expected_event_ids = {event.event_id for event in events}
expected_group_data = [
@@ -287,7 +284,7 @@ def test_lookup_group_data_stacktrace_bulk_exceptions(
for exception in exceptions:
mock_get_multi.side_effect = exception
with pytest.raises(Exception):
lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
lookup_group_data_stacktrace_bulk(self.project, rows, messages)
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
extra={
@@ -323,7 +320,7 @@ def test_lookup_group_data_stacktrace_bulk_not_stacktrace_grouping(self):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
@@ -363,46 +360,7 @@ def test_lookup_group_data_stacktrace_bulk_no_stacktrace_exception(self):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
hash=hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
for event in events
]
expected_stacktraces = [
f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
for i in range(2)
]
assert bulk_event_ids == {event.event_id for event in events}
assert invalid_event_ids == {event.event_id}
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces

def test_lookup_group_data_stacktrace_bulk_no_hash(self):
"""
Test that if a group does not have a hash (for whatever reason), its data is not included
in the bulk lookup result
"""
# Use 2 events
rows, events, messages, hashes = self.bulk_rows[:2], self.bulk_events[:2], {}, {}
group_ids = [row["group_id"] for row in rows]
for group_id in group_ids:
messages.update({group_id: self.bulk_messages[group_id]})
hashes.update({group_id: self.group_hashes[group_id]})
# Create one event with no hash
event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
rows.append({"event_id": event.event_id, "group_id": event.group_id})
messages.update({event.group_id: event.group.message})

(
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
@@ -430,7 +388,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_success(self):
self.group_hashes,
)
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes
self.project, rows, messages
)

expected_group_data = [
Expand Down Expand Up @@ -480,7 +438,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(

rows, messages, hashes = self.bulk_rows, self.bulk_messages, self.group_hashes
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes=hashes
self.project, rows, messages
)

events = self.bulk_events
@@ -500,72 +458,6 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces

@patch("sentry.tasks.backfill_seer_grouping_records.logger")
@patch("sentry.tasks.backfill_seer_grouping_records.lookup_group_data_stacktrace_bulk")
def test_lookup_group_data_stacktrace_bulk_with_fallback_no_hash(
self, mock_lookup_group_data_stacktrace_bulk, mock_logger
):
"""
Test that if a group does not have a hash (for whatever reason), we do not attempt the
fallback and we log it
"""
# Purposely exclude one event from being included in the bulk lookup response, so that the fallback is used
events_missing = self.bulk_events[:-1]
group_data, stacktrace_strings = [], []
for event in events_missing:
grouping_info = get_grouping_info(None, project=self.project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
group_data.append(
CreateGroupingRecordData(
group_id=event.group.id,
hash=self.group_hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
)
stacktrace_strings.append(stacktrace_string)
mock_lookup_group_data_stacktrace_bulk.return_value = (
{event.event_id for event in events_missing},
set(),
GroupStacktraceData(data=group_data, stacktrace_list=stacktrace_strings),
)

# Purposely remove the hash for the missing event
hashes = copy.deepcopy(self.group_hashes)
del hashes[self.bulk_events[-1].group.id]

rows, messages = self.bulk_rows, self.bulk_messages
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes=hashes
)

events = self.bulk_events[:-1]
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
hash=hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
for event in events
]
expected_stacktraces = [
f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
for i in range(4)
]
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.no_group_hash",
extra={
"organization_id": self.project.organization.id,
"project_id": self.project.id,
"group_id": self.bulk_events[-1].group_id,
},
)

@patch("sentry.tasks.backfill_seer_grouping_records.logger")
def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self, mock_logger):
"""
@@ -581,7 +473,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self
rows[-1]["event_id"] = 10000

bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes
self.project, rows, messages
)

events = self.bulk_events[:-1]
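The removed "no hash" tests covered groups whose GroupHash row was missing; with this change, events whose primary hash cannot be computed are skipped at lookup time instead. A self-contained sketch of that guard follows; get_primary_hash is the Event method used in this diff, while FakeEvent and partition_by_hash are illustrative stand-ins, not part of the Sentry codebase.

from dataclasses import dataclass


@dataclass
class FakeEvent:
    # Stand-in for an event whose primary hash may or may not be computable.
    event_id: str
    primary_hash: str | None

    def get_primary_hash(self) -> str | None:
        return self.primary_hash


def partition_by_hash(events: list[FakeEvent]) -> tuple[set[str], set[str]]:
    # Split event ids into (usable, invalid) based on the primary hash,
    # mirroring the guard added to lookup_group_data_stacktrace_bulk.
    usable, invalid = set(), set()
    for event in events:
        if not event.get_primary_hash():
            invalid.add(event.event_id)
        else:
            usable.add(event.event_id)
    return usable, invalid


# Example: the event without a primary hash lands in the invalid set.
usable, invalid = partition_by_hash([FakeEvent("e1", "c" * 32), FakeEvent("e2", None)])
assert usable == {"e1"} and invalid == {"e2"}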