ref(seer-grouping): Adjust types for group_id to hash switch (#70070)

lobsterkatie · web-flow · commit 617d44b90221 · 2024-05-06T12:44:23.000-07:00
This makes a few changes and additions to our types for Seer similar issue requests, in preparation for switching from sending the group id to sending the group hash. Specifically: - Make `group_id` optional in Seer request and response types, and add an optional `group_hash` property to both. - Add a `SeerSimilarIssueData` dataclass, to hold data from Seer about a single similar issue along with the issue's group id. Notes: - Though right now `SeerSimilarIssueData` has the same shape as `SimilarIssuesEmbeddingsData`, I chose to create a new type rather than reuse the existing one because once we make the `group_id` to `group_hash` switch, they will differ, in that the data which comes back from Seer will have the group's hash and the data we pass around will have the group's id. - I changed the name of `SimilarIssuesEmbeddingsData` to `RawSeerSimilarIssueData` to match the new dataclass. I'm not wedded to these names (both `SeerSimilarIssueData` and `RawSeerSimilarIssueData`) and am open to suggestions here, but I specifically moved away from the `SimilarIssuesEmbeddingsXXXXX` pattern because a) doing so makes the new names easier to distinguish from the request and response types, b) I needed a name which indicated that the data is about a single similar issue rather than all of the similar issues, and simply changing "Issues" to "Issue" wasn't obvious enough, and c) in `SimilarIssuesEmbeddingsXXXXX`, the "Issues" is really part of the "similar issues" descriptor on "embeddings", not something naming the contents of the data structure, so I needed "issue" to appear later on in the phrase. I could have gone with `SimilarIssuesEmbeddingsIssueData` and `RawSimilarIssuesEmbeddingsIssueData`, but those seemed a little cumbersome. I could be talked into it, though. - To keep things manageable, I'm going to do the switch to actually using `SeerSimilarIssueData` in a separate PR. [UPDATE: Done in #70240.]
diff --git a/src/sentry/api/endpoints/group_similar_issues_embeddings.py b/src/sentry/api/endpoints/group_similar_issues_embeddings.py
@@ -16,7 +16,7 @@
 from sentry.models.group import Group
 from sentry.models.user import User
 from sentry.seer.utils import (
-    SimilarIssuesEmbeddingsData,
+    RawSeerSimilarIssueData,
     SimilarIssuesEmbeddingsRequest,
     get_similar_issues_embeddings,
 )
@@ -108,7 +108,7 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
 
     def get_formatted_results(
         self,
-        similar_issues_data: Sequence[SimilarIssuesEmbeddingsData],
+        similar_issues_data: Sequence[RawSeerSimilarIssueData],
         user: User | AnonymousUser,
     ) -> Sequence[tuple[Mapping[str, Any], Mapping[str, Any]] | None]:
         """
diff --git a/src/sentry/seer/utils.py b/src/sentry/seer/utils.py
@@ -1,4 +1,5 @@
 import logging
+from dataclasses import dataclass
 from typing import NotRequired, TypedDict
 
 import sentry_sdk
@@ -85,23 +86,36 @@ def detect_breakpoints(breakpoint_request) -> BreakpointResponse:
 
 
 class SimilarIssuesEmbeddingsRequest(TypedDict):
-    group_id: int
     project_id: int
     stacktrace: str
     message: str
     k: NotRequired[int]  # how many neighbors to find
     threshold: NotRequired[float]
+    group_id: NotRequired[int]  # TODO: Remove this once we stop sending it to seer
+    group_hash: NotRequired[str]  # TODO: Make this required once id -> hash change is done
 
 
-class SimilarIssuesEmbeddingsData(TypedDict):
-    parent_group_id: int
+class RawSeerSimilarIssueData(TypedDict):
     stacktrace_distance: float
     message_distance: float
     should_group: bool
+    parent_group_id: NotRequired[int]  # TODO: Remove this once seer stops sending it
+    parent_group_hash: NotRequired[str]  # TODO: Make this required once id -> hash change is done
 
 
 class SimilarIssuesEmbeddingsResponse(TypedDict):
-    responses: list[SimilarIssuesEmbeddingsData]
+    responses: list[RawSeerSimilarIssueData]
+
+
+# Like the data that comes back from seer, but guaranteed to have a parent group id
+@dataclass
+class SeerSimilarIssueData:
+    stacktrace_distance: float
+    message_distance: float
+    should_group: bool
+    parent_group_id: int
+    # TODO: See if we end up needing the hash here
+    parent_group_hash: str | None = None
 
 
 def get_similar_issues_embeddings(
diff --git a/tests/sentry/api/endpoints/test_group_similar_issues_embeddings.py b/tests/sentry/api/endpoints/test_group_similar_issues_embeddings.py
@@ -11,7 +11,7 @@
 )
 from sentry.api.serializers.base import serialize
 from sentry.models.group import Group
-from sentry.seer.utils import SimilarIssuesEmbeddingsData, SimilarIssuesEmbeddingsResponse
+from sentry.seer.utils import RawSeerSimilarIssueData, SimilarIssuesEmbeddingsResponse
 from sentry.testutils.cases import APITestCase
 from sentry.testutils.helpers.features import with_feature
 from sentry.utils import json
@@ -652,13 +652,13 @@ def test_get_stacktrace_string_no_exception(self):
 
     def test_get_formatted_results(self):
         new_group = self.create_group(project=self.project)
-        response_1: SimilarIssuesEmbeddingsData = {
+        response_1: RawSeerSimilarIssueData = {
             "message_distance": 0.05,
             "parent_group_id": self.similar_group.id,
             "should_group": True,
             "stacktrace_distance": 0.01,
         }
-        response_2: SimilarIssuesEmbeddingsData = {
+        response_2: RawSeerSimilarIssueData = {
             "message_distance": 0.49,
             "parent_group_id": new_group.id,
             "should_group": False,