Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ hyp create hyp-pytorch-job \
--version 1.0 \
--job-name test-pytorch-job \
--image pytorch/pytorch:latest \
--command '["python", "train.py"]' \
--args '["--epochs", "10", "--batch-size", "32"]' \
--command '[python, train.py]' \
--args '[--epochs=10, --batch-size=32]' \
--environment '{"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32"}' \
--pull-policy "IfNotPresent" \
--instance-type ml.p4d.24xlarge \
Expand All @@ -170,8 +170,8 @@ hyp create hyp-pytorch-job \
--queue-name "training-queue" \
--priority "high" \
--max-retry 3 \
--volumes '["data-vol", "model-vol", "checkpoint-vol"]' \
--persistent-volume-claims '["shared-data-pvc", "model-registry-pvc"]' \
--volumes '[data-vol, model-vol, checkpoint-vol]' \
--persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' \
--output-s3-uri s3://my-bucket/model-artifacts
```

Expand Down Expand Up @@ -257,9 +257,10 @@ Along with the CLI, we also have SDKs available that can perform the training an

```

from sagemaker.hyperpod import HyperPodPytorchJob
from sagemaker.hyperpod.job import ReplicaSpec, Template, Spec, Container, Resources, RunPolicy, Metadata
from sagemaker.hyperpod.training import HyperPodPytorchJob
from sagemaker.hyperpod.training import ReplicaSpec, Template, Spec, Containers, Resources, RunPolicy
from sagemaker.hyperpod.common.config import Metadata

# Define job specifications
nproc_per_node = "1" # Number of processes per node
Expand All @@ -274,7 +275,7 @@ replica_specs =
(
containers =
[
Container
Containers
(
# Container name
name="container-name",
Expand Down
24 changes: 22 additions & 2 deletions examples/training/CLI/training-e2e-cli.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,31 @@
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "!hyp list-cluster --output table",
"id": "9df747dbfa211453"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"id": "b30debba",
"source": "!hyp set-cluster-context --cluster-name <cluster-name>",
"id": "8db986d2b42a9e88"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"source": "!hyperpod get-clusters"
"execution_count": null,
"source": [
"#verify the cluster context\n",
"!hyp get-cluster-context "
],
"id": "ba996d7dc8e128d5"
},
{
"metadata": {
Expand All @@ -46,6 +65,7 @@
"metadata": {},
"outputs": [],
"source": [
"#example command\n",
"!hyp create hyp-pytorch-job \\\n",
" --version 1.0 \\\n",
" --job-name test-pytorch-job-cli \\\n",
Expand Down
Loading