Skip to content

Commit 75e6cf3

Browse files
mollyheamazon authored and Roja Reddy Sareddy committed
Add telemetry and dog fooding fixes (#248)
* add telemetry to init experience, remove duplicate code in init_constants
* add filter for deprecation warning, fix hyp --version
* change default instance group name for instance group settings
1 parent 7631c87 commit 75e6cf3

File tree

10 files changed

+177
-72
lines changed

10 files changed

+177
-72
lines changed

examples/cluster_management/cluster_creation_sdk_experience.ipynb

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@
4040
},
4141
{
4242
"cell_type": "code",
43+
"execution_count": null,
4344
"metadata": {},
45+
"outputs": [],
4446
"source": [
4547
"import uuid\n",
4648
"import time\n",
@@ -83,7 +85,7 @@
8385
" instance_group_settings=[\n",
8486
" {\n",
8587
" \"InstanceCount\": 1,\n",
86-
" \"InstanceGroupName\": \"controller-group\",\n",
88+
" \"InstanceGroupName\": \"default\",\n",
8789
" \"InstanceType\": \"ml.t3.medium\",\n",
8890
" \"TargetAvailabilityZoneId\": \"use2-az2\",\n",
8991
" \"ThreadsPerCore\": 1,\n",
@@ -96,9 +98,7 @@
9698
"\n",
9799
"print(f\"Initialized cluster stack with prefix: {resource_prefix}\")\n",
98100
"print(f\"Cluster name: {cluster_stack.hyperpod_cluster_name}\")"
99-
],
100-
"outputs": [],
101-
"execution_count": null
101+
]
102102
},
103103
{
104104
"cell_type": "markdown",
@@ -117,7 +117,9 @@
117117
},
118118
{
119119
"cell_type": "code",
120+
"execution_count": null,
120121
"metadata": {},
122+
"outputs": [],
121123
"source": [
122124
"# Configure cluster with custom tags (equivalent to hyp configure --tags)\n",
123125
"cluster_tags = [\n",
@@ -143,9 +145,7 @@
143145
"\n",
144146
"print(f\"\\nNode recovery: {cluster_stack.node_recovery}\")\n",
145147
"print(f\"FSx storage capacity: {cluster_stack.storage_capacity} GiB\")"
146-
],
147-
"outputs": [],
148-
"execution_count": null
148+
]
149149
},
150150
{
151151
"cell_type": "markdown",
@@ -158,7 +158,9 @@
158158
},
159159
{
160160
"cell_type": "code",
161+
"execution_count": null,
161162
"metadata": {},
163+
"outputs": [],
162164
"source": [
163165
"# Display current configuration details\n",
164166
"print(\"=== Cluster Configuration ===\")\n",
@@ -176,9 +178,7 @@
176178
"print(f\" EKS Stack: {cluster_stack.create_eks_cluster_stack}\")\n",
177179
"print(f\" HyperPod Stack: {cluster_stack.create_hyperpod_cluster_stack}\")\n",
178180
"print(f\" FSx Stack: {cluster_stack.create_fsx_stack}\")"
179-
],
180-
"outputs": [],
181-
"execution_count": null
181+
]
182182
},
183183
{
184184
"cell_type": "markdown",
@@ -201,7 +201,9 @@
201201
},
202202
{
203203
"cell_type": "code",
204+
"execution_count": null,
204205
"metadata": {},
206+
"outputs": [],
205207
"source": [
206208
"# Create the HyperPod cluster (equivalent to hyp create)\n",
207209
"try:\n",
@@ -225,9 +227,7 @@
225227
"except Exception as e:\n",
226228
" print(f\"\\n❌ Cluster creation failed: {str(e)}\")\n",
227229
" raise"
228-
],
229-
"outputs": [],
230-
"execution_count": null
230+
]
231231
},
232232
{
233233
"cell_type": "markdown",
@@ -240,7 +240,9 @@
240240
},
241241
{
242242
"cell_type": "code",
243+
"execution_count": null,
243244
"metadata": {},
245+
"outputs": [],
244246
"source": [
245247
"# Monitor cluster creation progress\n",
246248
"def monitor_cluster_creation(stack_name, max_checks=30, interval=120):\n",
@@ -278,9 +280,7 @@
278280
"# Start monitoring (uncomment when cluster creation is initiated)\n",
279281
"# final_status = monitor_cluster_creation(stack_name, max_checks=5, interval=30)\n",
280282
"print(\"Monitoring function ready. Uncomment to start monitoring after cluster creation.\")"
281-
],
282-
"outputs": [],
283-
"execution_count": null
283+
]
284284
},
285285
{
286286
"cell_type": "markdown",
@@ -300,7 +300,9 @@
300300
},
301301
{
302302
"cell_type": "code",
303+
"execution_count": null,
303304
"metadata": {},
305+
"outputs": [],
304306
"source": [
305307
"# Get detailed information about the cluster stack (equivalent to hyp describe cluster-stack)\n",
306308
"def describe_cluster_stack(stack_name, region=\"us-east-2\"):\n",
@@ -347,9 +349,7 @@
347349
"# Describe the cluster stack (uncomment when stack exists)\n",
348350
"# describe_cluster_stack(stack_name)\n",
349351
"print(\"Describe function ready. Use after cluster creation is complete.\")"
350-
],
351-
"outputs": [],
352-
"execution_count": null
352+
]
353353
},
354354
{
355355
"cell_type": "markdown",
@@ -368,7 +368,9 @@
368368
},
369369
{
370370
"cell_type": "code",
371+
"execution_count": null,
371372
"metadata": {},
373+
"outputs": [],
372374
"source": [
373375
"# List all cluster stacks (equivalent to hyp list cluster-stack)\n",
374376
"def list_cluster_stacks(region=\"us-east-2\"):\n",
@@ -416,9 +418,7 @@
416418
" print(f\"\\n=== HyperPod Stacks ({len(hyperpod_stacks)}) ===\")\n",
417419
" for stack in hyperpod_stacks:\n",
418420
" print(f\" - {stack['StackName']} ({stack['StackStatus']})\")"
419-
],
420-
"outputs": [],
421-
"execution_count": null
421+
]
422422
},
423423
{
424424
"cell_type": "markdown",
@@ -439,7 +439,9 @@
439439
},
440440
{
441441
"cell_type": "code",
442+
"execution_count": null,
442443
"metadata": {},
444+
"outputs": [],
443445
"source": [
444446
"# Update cluster configuration using sagemaker-core Cluster class\n",
445447
"def update_cluster(cluster_name, region=\"us-east-2\"):\n",
@@ -524,9 +526,7 @@
524526
"# scaled_cluster = scale_instance_group(cluster_name, \"controller-group\", 2)\n",
525527
"\n",
526528
"print(\"Update functions ready. Use after cluster creation is complete.\")"
527-
],
528-
"outputs": [],
529-
"execution_count": null
529+
]
530530
},
531531
{
532532
"cell_type": "markdown",
@@ -539,7 +539,9 @@
539539
},
540540
{
541541
"cell_type": "code",
542+
"execution_count": null,
542543
"metadata": {},
544+
"outputs": [],
543545
"source": [
544546
"# Comprehensive cluster health check\n",
545547
"def check_cluster_health(cluster_name, region=\"us-east-2\"):\n",
@@ -596,9 +598,7 @@
596598
"# cluster_health = check_cluster_health(cluster_name)\n",
597599
"\n",
598600
"print(\"Health check function ready. Use after cluster creation is complete.\")"
599-
],
600-
"outputs": [],
601-
"execution_count": null
601+
]
602602
},
603603
{
604604
"cell_type": "markdown",

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ Parameters:
320320
InstanceGroupSettings1:
321321
Type: String
322322
Default: >-
323-
[{"InstanceCount":1,"InstanceGroupName":"controller-group","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}]
323+
[{"InstanceCount":1,"InstanceGroupName":"default","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}]
324324
Description: JSON array string containing instance group configurations.
325325
RigS3BucketName:
326326
Type: String

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class ClusterStackBase(BaseModel):
1717
helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release")
1818
node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty")
1919
node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"")
20-
instance_group_settings: Union[List[Any], None] = Field([{"InstanceCount":1,"InstanceGroupName":"controller-group","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}], description="List of string containing instance group configurations")
20+
instance_group_settings: Union[List[Any], None] = Field([{"InstanceCount":1,"InstanceGroupName":"default","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}], description="List of string containing instance group configurations")
2121
rig_settings: Union[List[Any], None] = Field(None, description="List of string containing restricted instance group configurations")
2222
rig_s3_bucket_name: Optional[str] = Field(None, description="The name of the S3 bucket used to store the RIG resources")
2323
tags: Union[List[Any], None] = Field(None, description="Custom tags for managing the SageMaker HyperPod cluster as an AWS resource")

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@
181181
"default": [
182182
{
183183
"InstanceCount": 1,
184-
"InstanceGroupName": "controller-group",
184+
"InstanceGroupName": "default",
185185
"InstanceType": "ml.t3.medium",
186186
"TargetAvailabilityZoneId": "use2-az2",
187187
"ThreadsPerCore": 1,
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import warnings
2+
# Reset warnings and show all except Pydantic serialization warnings
3+
warnings.resetwarnings()
4+
warnings.simplefilter("always")
5+
# Suppress specific Pydantic serialization warnings globally (this is ignored due to customized parsing logic)
6+
warnings.filterwarnings("ignore", message=".*PydanticSerializationUnexpectedValue.*", category=UserWarning)
7+
warnings.filterwarnings("ignore", message=".*serializer.*", category=UserWarning, module="pydantic")
8+
# Suppress kubernetes urllib3 deprecation warning (this is internal dependencies)
9+
warnings.filterwarnings("ignore", message=".*HTTPResponse.getheaders.*", category=DeprecationWarning, module="kubernetes")

src/sagemaker/hyperpod/cli/commands/init.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,16 @@
2525
get_default_version_for_template
2626
)
2727
from sagemaker.hyperpod.common.utils import get_aws_default_region
28+
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
29+
_hyperpod_telemetry_emitter,
30+
)
31+
from sagemaker.hyperpod.common.telemetry.constants import Feature
2832

2933
@click.command("init")
3034
@click.argument("template", type=click.Choice(list(TEMPLATES.keys())))
3135
@click.argument("directory", type=click.Path(file_okay=False), default=".")
3236
@click.option("--version", "-v", default=None, help="Schema version")
37+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_template_cli")
3338
def init(
3439
template: str,
3540
directory: str,
@@ -144,6 +149,7 @@ def init(
144149

145150

146151
@click.command("reset")
152+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_reset_cli")
147153
def reset():
148154
"""
149155
Reset the current directory's config.yaml to an "empty" scaffold:
@@ -176,6 +182,7 @@ def reset():
176182
@click.command("configure")
177183
@generate_click_command()
178184
@click.pass_context
185+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_configure_cli")
179186
def configure(ctx, model_config):
180187
"""
181188
Update any subset of fields in ./config.yaml by passing --<field> flags.
@@ -253,6 +260,7 @@ def configure(ctx, model_config):
253260

254261

255262
@click.command("validate")
263+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_validate_cli")
256264
def validate():
257265
"""
258266
Validate this directory's config.yaml against the appropriate schema.
@@ -263,6 +271,7 @@ def validate():
263271

264272
@click.command(name="_default_create")
265273
@click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.")
274+
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli")
266275
def _default_create(region):
267276
"""
268277
Validate configuration and render template files for deployment.

src/sagemaker/hyperpod/cli/constants/init_constants.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -71,37 +71,6 @@ def version_key(v):
7171
'hyp-pytorch-job.volume': _get_handler_from_latest_template("hyp-pytorch-job", "VOLUME_TYPE_HANDLER"),
7272
}
7373

74-
75-
def _get_handler_from_latest_template(template_name, handler_name):
76-
"""Dynamically import handler from the latest version of a template"""
77-
try:
78-
template_info = TEMPLATES[template_name]
79-
registry = template_info["registry"]
80-
81-
# Get latest version using same logic as _get_latest_class
82-
available_versions = list(registry.keys())
83-
def version_key(v):
84-
try:
85-
return tuple(map(int, v.split('.')))
86-
except ValueError:
87-
return (0, 0)
88-
89-
latest_version = max(available_versions, key=version_key)
90-
latest_model = registry[latest_version]
91-
92-
# Get handler from module
93-
module = sys.modules[latest_model.__module__]
94-
return getattr(module, handler_name)
95-
except (ImportError, AttributeError):
96-
return None
97-
98-
99-
# Template.field to handler mapping - avoids conflicts and works reliably
100-
SPECIAL_FIELD_HANDLERS = {
101-
'hyp-pytorch-job.volume': _get_handler_from_latest_template("hyp-pytorch-job", "VOLUME_TYPE_HANDLER"),
102-
}
103-
104-
10574
USAGE_GUIDE_TEXT_CFN = """# SageMaker HyperPod CLI - Initialization Workflow
10675
10776
This document explains the initialization workflow and related commands for the SageMaker HyperPod CLI.

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
import warnings
2-
# Reset warnings and show all except Pydantic serialization warnings
3-
warnings.resetwarnings()
4-
warnings.simplefilter("always")
5-
# Suppress specific Pydantic serialization warnings globally
6-
warnings.filterwarnings("ignore", message=".*PydanticSerializationUnexpectedValue.*", category=UserWarning)
7-
warnings.filterwarnings("ignore", message=".*serializer.*", category=UserWarning, module="pydantic")
8-
91
import click
102
import yaml
113
import json
@@ -56,7 +48,6 @@
5648
)
5749

5850

59-
@click.group(context_settings={'max_content_width': 200})
6051
def get_package_version(package_name):
6152
try:
6253
return version(package_name)

src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,31 @@ def _hyperpod_telemetry_emitter(feature: str, func_name: str):
149149
def decorator(func):
150150
@functools.wraps(func)
151151
def wrapper(*args, **kwargs):
152+
import inspect
153+
sig = inspect.signature(func)
154+
bound_args = sig.bind(*args, **kwargs)
155+
bound_args.apply_defaults()
156+
157+
# Get template value and create template-specific event name
158+
template = bound_args.arguments.get('template')
159+
if template:
160+
event_name = f"{func_name}_{template.replace('-', '_')}"
161+
else:
162+
event_name = func_name
163+
152164
extra = (
153-
f"{func_name}"
165+
f"{event_name}"
154166
f"&x-sdkVersion={SDK_VERSION}"
155167
f"&x-env={PYTHON_VERSION}"
156168
f"&x-sys={OS_NAME_VERSION}"
157169
)
170+
171+
# Add template and version to extra
172+
if template:
173+
extra += f"&x-template={template}"
174+
if 'version' in bound_args.arguments and bound_args.arguments['version']:
175+
extra += f"&x-version={bound_args.arguments['version']}"
176+
158177
start = perf_counter()
159178
try:
160179
result = func(*args, **kwargs)

0 commit comments

Comments
 (0)