diff --git a/examples/sam2_amg_server/cli.py b/examples/sam2_amg_server/cli.py index 265f8c7b73..b8afcfc3c7 100644 --- a/examples/sam2_amg_server/cli.py +++ b/examples/sam2_amg_server/cli.py @@ -5,6 +5,8 @@ from server import show_anns from server import model_type_to_paths from server import MODEL_TYPES_TO_MODEL +from server import set_fast +from server import set_furious from torchao._models.sam2.build_sam import build_sam2 from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator from torchao._models.sam2.utils.amg import rle_to_mask @@ -19,13 +21,17 @@ def main_docstring(): output_path (str): Path to output image """ -def main(checkpoint_path, model_type, input_path, output_path, points_per_batch=1024, output_format='png', verbose=False): +def main(checkpoint_path, model_type, input_path, output_path, points_per_batch=1024, output_format='png', verbose=False, fast=False, furious=False): device = "cuda" sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type) if verbose: print(f"Loading model {sam2_checkpoint} with config {model_cfg}") sam2 = build_sam2(model_cfg, sam2_checkpoint, device=device, apply_postprocessing=False) mask_generator = SAM2AutomaticMaskGenerator(sam2, points_per_batch=points_per_batch, output_mode="uncompressed_rle") + if fast: + set_fast(mask_generator) + if furious: + set_furious(mask_generator) image_tensor = file_bytes_to_image_tensor(bytearray(open(input_path, 'rb').read())) if verbose: print(f"Loaded image of size {tuple(image_tensor.shape)} and generating mask.") diff --git a/examples/sam2_amg_server/cli_on_modal.py b/examples/sam2_amg_server/cli_on_modal.py new file mode 100644 index 0000000000..fdd6316b27 --- /dev/null +++ b/examples/sam2_amg_server/cli_on_modal.py @@ -0,0 +1,94 @@ +from pathlib import Path + +import modal + +app = modal.App("torchao-sam-2-cli") + +TARGET = "/root/" + +image = ( + modal.Image.debian_slim(python_version="3.12.7") + .pip_install("numpy<3", "tqdm") + 
.pip_install( + "torch", + pre=True, + index_url="https://download.pytorch.org/whl/nightly/cu124", # tested with torch-2.6.0.dev20241120 + ) + .pip_install( + "torchvision", + pre=True, + index_url="https://download.pytorch.org/whl/nightly/cu124", # tested with torch-2.6.0.dev20241120 + ) + .apt_install("git") + .apt_install("libopencv-dev") + .apt_install("python3-opencv") + .run_commands(["git clone https://github.com/pytorch/ao.git /tmp/ao_src"]) + .run_commands(["cd /tmp/ao_src; python setup.py develop"]) + .pip_install( + "gitpython", + ) + .apt_install("wget") + .run_commands([f"wget https://raw.githubusercontent.com/pytorch/ao/refs/heads/main/examples/sam2_amg_server/requirements.txt"]) + .pip_install_from_requirements( + 'requirements.txt', + ) +) + +checkpoints = modal.Volume.from_name("checkpoints", create_if_missing=True) + +@app.function( + image=image, + gpu="H100", + volumes={ + TARGET + "checkpoints": checkpoints, + # # mount the caches of torch.compile and friends + # "/root/.nv": modal.Volume.from_name("torchao-sam-2-cli-nv-cache", create_if_missing=True), + # "/root/.triton": modal.Volume.from_name( + # "torchao-sam-2-cli-triton-cache", create_if_missing=True + # ), + # "/root/.inductor-cache": modal.Volume.from_name( + # "torchao-sam-2-cli-inductor-cache", create_if_missing=True + # ), + }, + timeout=60 * 60, +) +def eval(input_bytes, fast, furious): + import torch + import torchao + import os + + import subprocess + from pathlib import Path + from git import Repo + + def download_file(url, filename): + command = f"wget -O {filename} {url}" + subprocess.run(command, shell=True, check=True) + + os.chdir(Path(TARGET)) + download_file("https://raw.githubusercontent.com/pytorch/ao/refs/heads/climodal1/examples/sam2_amg_server/cli.py", "cli.py") + download_file("https://raw.githubusercontent.com/pytorch/ao/refs/heads/climodal1/examples/sam2_amg_server/server.py", "server.py") + # Create a Path object for the current directory + 
current_directory = Path('.') + + with open('/tmp/dog.jpg', 'wb') as file: + file.write(input_bytes) + + import sys + sys.path.append(".") + from cli import main as cli_main + cli_main(Path(TARGET) / Path("checkpoints"), + model_type="large", + input_path="/tmp/dog.jpg", + output_path="/tmp/dog_masked_2.png", + verbose=True, + fast=fast, + furious=furious) + + return bytearray(open('/tmp/dog_masked_2.png', 'rb').read()) + +@app.local_entrypoint() +def main(input_path, output_path, fast=False, furious=False): + bytes = eval.remote(open(input_path, 'rb').read(), fast, furious) + with open(output_path, "wb") as file: + file.write(bytes) diff --git a/examples/sam2_amg_server/server.py b/examples/sam2_amg_server/server.py index cbf916c2aa..d779411c93 100644 --- a/examples/sam2_amg_server/server.py +++ b/examples/sam2_amg_server/server.py @@ -332,6 +332,39 @@ def model_type_to_paths(checkpoint_path, model_type): model_cfg = f"configs/sam2.1/{MODEL_TYPES_TO_CONFIG[model_type]}" return sam2_checkpoint, model_cfg +def set_fast(mask_generator): + # TODO: Using CUDA graphs can cause numerical differences? 
+ mask_generator.predictor.model.image_encoder = torch.compile( + mask_generator.predictor.model.image_encoder, + mode="max-autotune", + fullgraph=True, + dynamic=False, + ) + + mask_generator.predictor._predict_masks = torch.compile( + mask_generator.predictor._predict_masks, + mode="max-autotune", + fullgraph=True, + dynamic=False, + ) + + # mask_generator.predictor._predict_masks_postprocess = torch.compile( + # mask_generator.predictor._predict_masks_postprocess, + # fullgraph=True, + # dynamic=True, + # ) + + +def set_furious(mask_generator): + mask_generator.predictor.model.image_encoder = mask_generator.predictor.model.image_encoder.to(torch.float16) + # NOTE: Not baseline feature + mask_generator.predictor._image_dtype = torch.float16 + mask_generator.predictor._transforms_device = mask_generator.predictor.device + torch.set_float32_matmul_precision('high') + mask_generator.predictor.model.sam_mask_decoder = mask_generator.predictor.model.sam_mask_decoder.to(torch.float16) + # NOTE: Not baseline feature + mask_generator.predictor.model.sam_mask_decoder._src_dtype = torch.float16 + def main(checkpoint_path, model_type, @@ -378,36 +411,10 @@ def main(checkpoint_path, if fast: assert not baseline, "--fast cannot be combined with baseline. code to be torch.compile(fullgraph=True) compatible." - # TODO: Using CUDA graphs can cause numerical differences? 
- mask_generator.predictor.model.image_encoder = torch.compile( - mask_generator.predictor.model.image_encoder, - mode="max-autotune", - fullgraph=True, - dynamic=False, - ) - - mask_generator.predictor._predict_masks = torch.compile( - mask_generator.predictor._predict_masks, - mode="max-autotune", - fullgraph=True, - dynamic=False, - ) - - # mask_generator.predictor._predict_masks_postprocess = torch.compile( - # mask_generator.predictor._predict_masks_postprocess, - # fullgraph=True, - # dynamic=True, - # ) + set_fast(mask_generator) if furious: - mask_generator.predictor.model.image_encoder = mask_generator.predictor.model.image_encoder.to(torch.float16) - # NOTE: Not baseline feature - mask_generator.predictor._image_dtype = torch.float16 - mask_generator.predictor._transforms_device = mask_generator.predictor.device - torch.set_float32_matmul_precision('high') - mask_generator.predictor.model.sam_mask_decoder = mask_generator.predictor.model.sam_mask_decoder.to(torch.float16) - # NOTE: Not baseline feature - mask_generator.predictor.model.sam_mask_decoder._src_dtype = torch.float16 + set_furious(mask_generator) with open('dog.jpg', 'rb') as f: image_tensor = file_bytes_to_image_tensor(bytearray(f.read()))