diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 7852377d..52eb4a81 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -18,24 +18,31 @@ jobs:
     container:
       image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
          python-version: '3.10'
-
+
       - name: Create input files
-        shell: bash
-        run: |
-          # Extract the payload content without printing it
-          PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
-          # Apply mask to the extracted content
-          echo "::add-mask::$PAYLOAD"
-
-          # Now write to file (won't be logged since it's masked)
-          echo "$PAYLOAD" > payload.json
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 2
+          max_attempts: 5
+          shell: bash
+          command: |
+            # install jq
+            apt update
+            apt install -y jq
+            # Extract the payload content without printing it
+            PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
+
+            # Apply mask to the extracted content
+            echo "::add-mask::$PAYLOAD"
+
+            # Now write to file (won't be logged since it's masked)
+            echo "$PAYLOAD" > payload.json
 
       - name: Install uv
         uses: astral-sh/setup-uv@v3
diff --git a/examples/eval.py b/examples/eval.py
index e414a580..37d3eef9 100644
--- a/examples/eval.py
+++ b/examples/eval.py
@@ -17,7 +17,7 @@ except ImportError:
     TestSpec = dict
 
-from reference import check_implementation, generate_input
+from reference import check_implementation, generate_input, ref_kernel
 
 
 class PopcornOutput:
@@ -198,18 +198,21 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T
     return 112
 
 
-def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool) -> Stats | Any:
     """
     Runs one benchmark. Do not call directly.
     """
-    from submission import custom_kernel
+    if not is_baseline_run:
+        # submission does not exist for a baseline run
+        from submission import custom_kernel
 
     durations = []
     # generate input data once
     data = generate_input(**test.args)
     check_copy = _clone_data(data)
+    active_kernel = ref_kernel if is_baseline_run else custom_kernel
     # first, one obligatory correctness check
-    output = custom_kernel(data)
+    output = active_kernel(data)
     good, message = wrap_check_implementation(check_copy, output)
     if not good:
         return message
@@ -229,12 +232,12 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
             check_copy = _clone_data(data)
         torch.cuda.synchronize()
         start = time.perf_counter_ns()
-        output = custom_kernel(data)
+        output = active_kernel(data)
         torch.cuda.synchronize()
         end = time.perf_counter_ns()
 
         if recheck:
-            good, message = check_implementation(check_copy, output)
+            good, message = wrap_check_implementation(check_copy, output)
             if not good:
                 return message
@@ -249,7 +252,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     return calculate_stats(durations)
 
 
-def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float):
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float, is_baseline_run: bool = False):
     """
     For a particular test case, check correctness (if applicable) and grab runtime results.
@@ -260,7 +263,7 @@ def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bo
     @param max_time_ns: Timeout time in nanoseconds.
     @return: A Stats object for this particular benchmark case or an error if the test fails.
     """
-    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns, is_baseline_run))
 
 
 def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
@@ -300,13 +303,13 @@ def run_single_profile(test: TestCase) -> str:
     """
     Runs a single test case. Do not call directly
     """
-    from submission import custom_kernel
     from torch.profiler import profile, record_function, ProfilerActivity
+    from submission import custom_kernel
 
     data = generate_input(**test.args)
     torch.cuda.synchronize()
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        submission_output = custom_kernel(_clone_data(data))
+        submission_output = active_kernel(_clone_data(data))
     torch.cuda.synchronize()
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
@@ -327,9 +330,9 @@ def main():
         return 111
 
     if len(sys.argv) < 3:
-        return 2
+        return 222
 
-    mode = sys.argv[1]
+    mode = sys.argv[1].strip()
     seed = os.getenv("POPCORN_SEED")
     os.unsetenv("POPCORN_SEED")
     seed = int(seed) if seed else None
@@ -345,13 +348,14 @@ def main():
         if mode == "benchmark":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if (mode == "leaderboard") or (mode == "baseline"):
+            is_baseline_run = mode == "baseline"
             # warmup
-            run_single_benchmark(pool, tests[0], False, 100, 1e7)
+            run_single_benchmark(pool, tests[0], False, 100, 1e7, is_baseline_run)
             logger.log("benchmark-count", len(tests))
 
             passed = True
             for i in range(len(tests)):
-                result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                result = run_single_benchmark(pool, tests[i], True, 100, 30e9, is_baseline_run)
                 logger.log(f"benchmark.{i}.spec", tests[i].spec)
 
                 if isinstance(result, Stats):
                     for field in dataclasses.fields(Stats):
@@ -367,7 +371,9 @@ def main():
             run_profiling(logger, tests)
         else:
             # TODO: Implement script mode
-            return 2
+            logger.log(mode, "not implemented")
+            print(f"mode {mode} not implemented")
+            return 333
 
 
 if __name__ == "__main__":
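Note: the eval.py changes boil down to picking one kernel up front and reusing the existing timing loop. A minimal, self-contained sketch of that pattern follows; `ref_kernel` and `custom_kernel` here are toy stand-ins for the real `reference.py`/`submission.py` modules, and the timing is simplified.

```python
# Minimal sketch of the baseline/submission selection pattern used by
# _run_single_benchmark above. The two kernels are stand-ins; the real harness
# imports ref_kernel from reference.py and custom_kernel from submission.py.
import time


def ref_kernel(data):
    return [x * 2 for x in data]


def custom_kernel(data):
    return [x + x for x in data]


def benchmark(data, is_baseline_run: bool, repeats: int = 5) -> float:
    # select the kernel once, then time it with the same loop either way
    active_kernel = ref_kernel if is_baseline_run else custom_kernel
    durations = []
    for _ in range(repeats):
        start = time.perf_counter_ns()
        active_kernel(data)
        durations.append(time.perf_counter_ns() - start)
    return sum(durations) / len(durations)


if __name__ == "__main__":
    print("baseline mean ns:", benchmark(list(range(1024)), is_baseline_run=True))
    print("submission mean ns:", benchmark(list(range(1024)), is_baseline_run=False))
```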
diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py
index 8e39ee2f..a20c9bb3 100644
--- a/src/discord-cluster-manager/cogs/admin_cog.py
+++ b/src/discord-cluster-manager/cogs/admin_cog.py
@@ -10,7 +10,8 @@ import discord
 import env
 import yaml
-from consts import GitHubGPU, ModalGPU
+from cogs.leaderboard_cog import LeaderboardSubmitCog
+from consts import GitHubGPU, ModalGPU, SubmissionMode
 from discord import app_commands
 from discord.ext import commands, tasks
 from leaderboard_db import leaderboard_name_autocomplete
@@ -120,6 +121,10 @@ def __init__(self, bot: "ClusterBot"):
             name="set-forum-ids", description="Sets forum IDs"
         )(self.set_forum_ids)
 
+        self.baseline_run = bot.admin_group.command(
+            name="baseline-run", description="Create a baseline run for a leaderboard"
+        )(self.baseline_run)
+
         self._scheduled_cleanup_temp_users.start()
 
     # --------------------------------------------------------------------------
@@ -1025,3 +1030,58 @@ async def set_forum_ids(self, interaction: discord.Interaction):
             error_message = f"Error updating forum ids: {str(e)}"
             logger.error(error_message, exc_info=True)
             await send_discord_message(interaction, error_message, ephemeral=True)
+
+    # ----------------------------------------------------------------------
+    # Baseline run submission (admin only)
+    # ----------------------------------------------------------------------
+    @discord.app_commands.describe(
+        leaderboard_name="Name of the leaderboard to create a baseline run for",
+        gpu="GPU(s) to use; leave empty for interactive selection",
+        force="Create another baseline run even if one already exists.",
+    )
+    @discord.app_commands.autocomplete(
+        leaderboard_name=leaderboard_name_autocomplete,
+    )
+    @with_error_handling
+    async def baseline_run(
+        self,
+        interaction: discord.Interaction,
+        leaderboard_name: str,
+        gpu: Optional[str] = None,
+        force: bool = False,
+    ):
+        """Admin command to create (or force-create) a baseline run."""
+
+        # Ensure caller is admin
+        is_admin = await self.admin_check(interaction)
+        if not is_admin:
+            await send_discord_message(
+                interaction,
+                "You need Admin permissions to run this command.",
+                ephemeral=True,
+            )
+            return
+
+        # Check for existing baseline run unless forcing
+        if not force:
+            with self.bot.leaderboard_db as db:
+                if db.has_baseline_run(leaderboard_name):
+                    await send_discord_message(
+                        interaction,
+                        (
+                            "A baseline run already exists for this leaderboard. "
+                            "Use the 'force' flag to create another."
+                        ),
+                        ephemeral=True,
+                    )
+                    return
+
+        lb_cog = LeaderboardSubmitCog(self.bot)
+
+        await lb_cog.submit(
+            interaction=interaction,
+            leaderboard_name=leaderboard_name,
+            script=None,
+            mode=SubmissionMode.BASELINE,
+            gpu=gpu,
+        )
diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py
index fadc9c41..354752f7 100644
--- a/src/discord-cluster-manager/cogs/leaderboard_cog.py
+++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py
@@ -5,6 +5,8 @@ import discord
 from consts import (
+    BASELINE_USER,
+    BASELINE_USER_ID,
     SubmissionMode,
     get_gpu_by_name,
 )
@@ -63,28 +65,51 @@ async def on_submit_hook(  # noqa: C901
         self,
         interaction: discord.Interaction,
         leaderboard_name: Optional[str],
-        script: discord.Attachment,
+        script: Optional[discord.Attachment],
         mode: SubmissionMode,
         cmd_gpus: Optional[List[str]],
     ) -> int:
         """
         Called as the main body of a submission to route to the correct runner.
         """
-        # Read the template file
-        submission_content = await script.read()
-        try:
-            submission_content = submission_content.decode()
-        except UnicodeError:
-            await send_discord_message(
-                interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True
-            )
-            return -1
+        if script is None:
+            if mode != SubmissionMode.BASELINE and not script:
+                await send_discord_message(
+                    interaction,
+                    "A script attachment is required unless the submission mode is baseline",
+                    ephemeral=True,
+                )
+                return -1
+            else:
+                submission_content = ""
+        else:
+            # Read the template file
+            submission_content = await script.read()
+
+            try:
+                submission_content = submission_content.decode()
+            except UnicodeError:
+                await send_discord_message(
+                    interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True
+                )
+                return -1
+
+        if mode == SubmissionMode.BASELINE:
+            # create fake baseline submission
+            file_name = None
+            submission_content = None
+            user_id = BASELINE_USER_ID
+            user_name = BASELINE_USER
+        else:
+            file_name = script.filename
+            submission_content = submission_content
+            user_id = interaction.user.id
+            user_name = interaction.user.global_name or interaction.user.name
 
         req = SubmissionRequest(
             code=submission_content,
-            file_name=script.filename,
-            user_id=interaction.user.id,
+            file_name=file_name,
+            user_id=user_id,
             gpus=cmd_gpus,
             leaderboard=leaderboard_name,
         )
@@ -105,26 +130,28 @@ async def on_submit_hook(  # noqa: C901
 
         command = self.bot.get_cog("SubmitCog").submit_leaderboard
 
-        user_name = interaction.user.global_name or interaction.user.name
         # Create a submission entry in the database
         with self.bot.leaderboard_db as db:
             sub_id = db.create_submission(
                 leaderboard=req.leaderboard,
-                file_name=script.filename,
+                file_name=file_name,
                 code=submission_content,
-                user_id=interaction.user.id,
+                user_id=user_id,
                 time=datetime.now(),
                 user_name=user_name,
             )
 
+        if mode == SubmissionMode.BASELINE:
+            run_msg = f"Submission **{sub_id}** is a baseline submission for `{req.leaderboard}`"
+        else:
+            run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`"
-        run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{req.leaderboard}`"
         reporter = MultiProgressReporter(interaction, run_msg)
         try:
             tasks = [
                 command(
                     sub_id,
                     submission_content,
-                    script.filename,
+                    file_name,
                     gpu,
                     reporter.add_run(f"{gpu.name} on {gpu.runner}"),
                     req.task,
@@ -140,7 +167,7 @@ async def on_submit_hook(  # noqa: C901
                 command(
                     sub_id,
                     submission_content,
-                    script.filename,
+                    file_name,
                     gpu,
                     reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"),
                     req.task,
@@ -155,7 +182,7 @@ async def on_submit_hook(  # noqa: C901
         with self.bot.leaderboard_db as db:
             db.mark_submission_done(sub_id)
 
-        if mode == SubmissionMode.LEADERBOARD:
+        if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.BASELINE:
             await self.post_submit_hook(interaction, sub_id)
         return sub_id
 
@@ -194,16 +221,15 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem):
     async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int):
         with self.bot.leaderboard_db as db:
             sub_data: SubmissionItem = db.get_submission_by_id(sub_id)
-
         result_lines = []
         for run in sub_data["runs"]:
             if (
                 not run["secret"]
-                and run["mode"] == SubmissionMode.LEADERBOARD.value
+                and (run["mode"] == SubmissionMode.LEADERBOARD.value
+                     or run["mode"] == SubmissionMode.BASELINE.value)
                 and run["passed"]
             ):
                 result_lines.append(self.generate_run_verdict(run, sub_data))
-
         if len(result_lines) > 0:
             await send_discord_message(
                 interaction,
@@ -224,10 +250,19 @@ async def submit(
         self,
         interaction: discord.Interaction,
         leaderboard_name: Optional[str],
-        script: discord.Attachment,
+        script: Optional[discord.Attachment],
         mode: SubmissionMode,
         gpu: Optional[str],
     ):
+
+        if not mode == SubmissionMode.BASELINE and not script:
+            await send_discord_message(
+                interaction,
+                "A script attachment is required unless the submission mode is baseline",
+                ephemeral=True,
+            )
+            return
+
         if not self.bot.accepts_jobs:
             await send_discord_message(
                 interaction,
@@ -319,7 +354,6 @@ async def submit_ranked(
             interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu
         )
 
-
 async def lang_autocomplete(
     interaction: discord.Interaction,
     current: str,
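Note: a baseline submission arrives without an attachment, so the cog swaps in placeholder metadata before creating the database row. The sketch below condenses that decision; the constants mirror the ones added to consts.py, everything else is illustrative.

```python
# Sketch of how baseline submissions get placeholder metadata while normal
# submissions keep the uploader's identity. Values mirror consts.BASELINE_USER*.
from typing import NamedTuple, Optional

BASELINE_USER = "BASELINE_USER"
BASELINE_USER_ID = -123


class SubmissionMeta(NamedTuple):
    file_name: Optional[str]
    code: Optional[str]
    user_id: int
    user_name: str


def submission_meta(is_baseline: bool, file_name: str = "", code: str = "",
                    user_id: int = 0, user_name: str = "") -> SubmissionMeta:
    if is_baseline:
        # No user file: the runner regenerates the reference kernel itself.
        return SubmissionMeta(None, None, BASELINE_USER_ID, BASELINE_USER)
    return SubmissionMeta(file_name, code, user_id, user_name)


print(submission_meta(True))
print(submission_meta(False, "kernel.py", "print('hi')", 42, "alice"))
```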
Is it UTF-8?", ephemeral=True + ) + return -1 + if mode == SubmissionMode.BASELINE: + # create fake baseline submission + file_name = None + submission_content = None + user_id = BASELINE_USER_ID + user_name = BASELINE_USER + else: + file_name = script.filename + submission_content = submission_content + user_id = interaction.user.id + user_name = interaction.user.global_name or interaction.user.name req = SubmissionRequest( code=submission_content, - file_name=script.filename, - user_id=interaction.user.id, + file_name=file_name, + user_id=user_id, gpus=cmd_gpus, leaderboard=leaderboard_name, ) @@ -105,26 +130,28 @@ async def on_submit_hook( # noqa: C901 command = self.bot.get_cog("SubmitCog").submit_leaderboard - user_name = interaction.user.global_name or interaction.user.name # Create a submission entry in the database with self.bot.leaderboard_db as db: sub_id = db.create_submission( leaderboard=req.leaderboard, - file_name=script.filename, + file_name=file_name, code=submission_content, - user_id=interaction.user.id, + user_id=user_id, time=datetime.now(), user_name=user_name, ) + if mode == SubmissionMode.BASELINE: + run_msg = f"Submission **{sub_id}**: is a baseline submission for `{req.leaderboard}`" + else: + run_msg = f"Submission **{sub_id}**: `{file_name}` for `{req.leaderboard}`" - run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{req.leaderboard}`" reporter = MultiProgressReporter(interaction, run_msg) try: tasks = [ command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner}"), req.task, @@ -140,7 +167,7 @@ async def on_submit_hook( # noqa: C901 command( sub_id, submission_content, - script.filename, + file_name, gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, @@ -155,7 +182,7 @@ async def on_submit_hook( # noqa: C901 with self.bot.leaderboard_db as db: db.mark_submission_done(sub_id) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.LEADERBOARD or mode == SubmissionMode.BASELINE: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -194,16 +221,15 @@ def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): with self.bot.leaderboard_db as db: sub_data: SubmissionItem = db.get_submission_by_id(sub_id) - result_lines = [] for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and (run["mode"] == SubmissionMode.LEADERBOARD.value + or run["mode"] == SubmissionMode.BASELINE.value) and run["passed"] ): result_lines.append(self.generate_run_verdict(run, sub_data)) - if len(result_lines) > 0: await send_discord_message( interaction, @@ -224,10 +250,19 @@ async def submit( self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, gpu: Optional[str], ): + + if not mode == SubmissionMode.BASELINE and not script: + await send_discord_message( + interaction, + "Script attachment is required for this unless submission mode is baseline", + ephemeral=True, + ) + return + if not self.bot.accepts_jobs: await send_discord_message( interaction, @@ -319,7 +354,6 @@ async def submit_ranked( interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu ) - async def lang_autocomplete( interaction: discord.Interaction, current: str, diff --git a/src/discord-cluster-manager/cogs/submit_cog.py 
diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py
index 928f59d4..03ada325 100644
--- a/src/discord-cluster-manager/consts.py
+++ b/src/discord-cluster-manager/consts.py
@@ -80,7 +80,7 @@ class SubmissionMode(Enum):
     """
     Different types of submission that can be made:
     Test: Run tests and give detailed results about passed/failed tests. These have short timeouts.
-    Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times.
+    Benchmark: Run larger benchmarks. Each benchmark is tested once, then run multiple times.
     Profile: Gather profiling information. One selected benchmark is run under the profiler.
     No testing is performed in this mode (sometimes, you need to profile deliberately broken code)
     Leaderboard: Official submission to the leaderboard. This first runs public tests, then a
@@ -97,7 +97,7 @@ class SubmissionMode(Enum):
     LEADERBOARD = "leaderboard"
     PRIVATE = "private"
     SCRIPT = "script"
-
+    BASELINE = "baseline"
 
 class Language(Enum):
     Python = "py"
@@ -157,3 +157,7 @@ class RankCriterion(Enum):
 --index-url https://download.pytorch.org/whl/rocm6.2.4
 torch
 """
+
+# Constants used for baseline runs
+BASELINE_USER = "BASELINE_USER"
+BASELINE_USER_ID = -123
diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py
index d48e8404..69a08d7a 100644
--- a/src/discord-cluster-manager/leaderboard_db.py
+++ b/src/discord-cluster-manager/leaderboard_db.py
@@ -6,6 +6,7 @@ import discord
 import psycopg2
+from consts import BASELINE_USER, BASELINE_USER_ID
 from env import (
     DATABASE_URL,
     DISABLE_SSL,
@@ -213,6 +214,11 @@ def create_submission(
         time: datetime.datetime,
         user_name: str = None,
     ) -> Optional[int]:
+        if user_id == BASELINE_USER_ID and user_name == BASELINE_USER:
+            # todo: add reference code to the database
+            code = ""
+            file_name = "reference.py"
+
         try:
             # check if we already have the code
             self.cursor.execute(
@@ -287,6 +293,22 @@ def create_submission(
             self.connection.rollback()  # Ensure rollback if error occurs
             raise KernelBotError("Error during creation of submission") from e
 
+    def has_baseline_run(self, leaderboard_name: str) -> bool:
+        try:
+            self.cursor.execute(
+                """
+                SELECT COUNT(*) FROM leaderboard.runs r
+                JOIN leaderboard.submission s ON r.submission_id = s.id
+                JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id
+                WHERE l.name = %s AND s.user_id = %s
+                """,
+                (leaderboard_name, str(BASELINE_USER_ID)),
+            )
+            return self.cursor.fetchone()[0] > 0
+        except psycopg2.Error as e:
+            logger.error("Error checking for reference run", exc_info=e)
+            return False
+
     def mark_submission_done(
         self,
         submission: int,
diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py
index 4e09c2c5..07e3b517 100644
--- a/src/discord-cluster-manager/report.py
+++ b/src/discord-cluster-manager/report.py
@@ -160,6 +160,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     Creates a minimalistic report for `runs`,
     returned as a list of status strings
     """
+    any_compile = False
     result = []
 
     for r in runs.values():
@@ -218,6 +219,16 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
             result.append("✅ Leaderboard run successful")
         elif full:
             result.append("❌ Leaderboard missing")
+
+    if "baseline" in runs:
+        ref_run = runs["baseline"].run
+        if not ref_run.success:
+            result.append("❌ Running baseline failed" + _short_fail_reason(ref_run))
+        elif not ref_run.passed:
+            result.append("❌ Baseline run failed")
+        else:
+            result.append("✅ Baseline run successful")
+
     return result
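Note: `has_baseline_run` is a plain existence check keyed on the baseline user id. The sketch below reproduces the join-and-count shape against an in-memory SQLite database as a stand-in for the real psycopg2 connection and schema.

```python
# Illustration of the existence check behind has_baseline_run, using sqlite3
# and a simplified schema in place of the real PostgreSQL tables.
import sqlite3

BASELINE_USER_ID = -123

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE leaderboard (id INTEGER PRIMARY KEY, name TEXT);
    CREATE TABLE submission (id INTEGER PRIMARY KEY, leaderboard_id INTEGER, user_id TEXT);
    CREATE TABLE runs (id INTEGER PRIMARY KEY, submission_id INTEGER);
    INSERT INTO leaderboard VALUES (1, 'matmul');
    INSERT INTO submission VALUES (10, 1, '-123');
    INSERT INTO runs VALUES (100, 10);
    """
)


def has_baseline_run(leaderboard_name: str) -> bool:
    row = conn.execute(
        """
        SELECT COUNT(*) FROM runs r
        JOIN submission s ON r.submission_id = s.id
        JOIN leaderboard l ON s.leaderboard_id = l.id
        WHERE l.name = ? AND s.user_id = ?
        """,
        (leaderboard_name, str(BASELINE_USER_ID)),
    ).fetchone()
    return row[0] > 0


print(has_baseline_run("matmul"))   # True
print(has_baseline_run("softmax"))  # False
```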
["benchmark", "profile", "leaderboard", "baseline"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": @@ -432,6 +431,7 @@ def run_cuda_script( # # noqa: C901 def run_pytorch_script( # noqa: C901 sources: dict[str, str], main: str, + is_baseline: bool = False, **kwargs, ) -> EvalResult: """ @@ -448,38 +448,39 @@ def run_pytorch_script( # noqa: C901 start = datetime.datetime.now() try: assert main in sources.keys() - - # Write submission files to directory _create_files(sources) # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. - try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) - if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + comp = None + if not is_baseline: + try: + compile_run = run_program(["python", + "submission.py"], + seed=1, + timeout=Timeout.COMPILE) + if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: + comp = CompileResult( + nvcc_found=True, + nvcc_version="", + success=True, + command=compile_run.command, + stdout=compile_run.stdout, + stderr=compile_run.stderr, + exit_code=compile_run.exit_code, + ) + except subprocess.CalledProcessError as e: + # This step is purely optional, so we just go on + # if it fails comp = CompileResult( - nvcc_found=True, + nvcc_found=False, nvcc_version="", - success=True, - command=compile_run.command, - stdout=compile_run.stdout, - stderr=compile_run.stderr, - exit_code=compile_run.exit_code, + success=False, + command="python submission.py", + stdout=e.stdout, + stderr=e.stderr, + exit_code=e.returncode, ) - else: - comp = None - except subprocess.CalledProcessError as e: - # This step is purely optional, so we just go on - # if it fails - comp = CompileResult( - nvcc_found=False, - nvcc_version="", - success=False, - command="python submission.py", - stdout=e.stdout, - stderr=e.stderr, - exit_code=e.returncode, - ) run = run_single_evaluation(["python", main], **kwargs) @@ -511,7 +512,7 @@ def run_evaluation( require multiple runner calls. 
""" results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script"]: + if mode in ["test", "benchmark", "profile", "script", "baseline"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -528,7 +529,7 @@ def run_evaluation( # if they pass, run the leaderboard validation results["leaderboard"] = call(mode="leaderboard") else: - raise AssertionError("Invalid mode") + raise AssertionError(f"Invalid mode: {mode}") return results @@ -544,6 +545,12 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + mode = config["mode"] + is_baseline = False + if mode == "baseline": + config["sources"].pop("submission.py", None) + is_baseline = True + common_args = { "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), @@ -558,6 +565,7 @@ def run_config(config: dict): run_pytorch_script, sources=config["sources"], main=config["main"], + is_baseline=is_baseline, **common_args, ) elif config["lang"] == "cu": diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 2777b15f..37c14de6 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -27,19 +27,31 @@ class ProcessedSubmissionRequest(SubmissionRequest): def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: - if profanity.contains_profanity(req.file_name): - raise KernelBotError("Please provide a non rude filename") + # Detect reference submissions (no file name & no code provided) + # A reference submission is identified by missing/empty code content (no user file) + is_baseline_submission = not req.code - # check file extension - if not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + # Perform filename/content related checks only for *non* reference submissions + if not is_baseline_submission: + if profanity.contains_profanity(req.file_name): + raise KernelBotError("Please provide a non rude filename") + + # check file extension (if filename provided) + if req.file_name and not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + raise KernelBotError( + "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + ) + + # process file directives (GPU selection / leaderboard name) + req = handle_popcorn_directives(req) + + # Ensure leaderboard name is present (might have come from the command directly) + if req.leaderboard is None: raise KernelBotError( - "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + "Missing leaderboard name. Either supply one as a command \ + argument or via ``#!POPCORN leaderboard `` directive.", ) - # process file directives - req = handle_popcorn_directives(req) - assert req.leaderboard is not None - leaderboard = lookup_leaderboard(req.leaderboard, lb_db) check_deadline(leaderboard) @@ -117,14 +129,6 @@ def handle_popcorn_directives(req: SubmissionRequest) -> SubmissionRequest: else: req.leaderboard = info["leaderboard"] - if req.leaderboard is None: - raise KernelBotError( - "Missing leaderboard name. 
" - "Either supply one as an argument in the submit command, or " - "specify it in your submission script using the " - "`{#,//}!POPCORN leaderboard ` directive.", - ) - return req diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index 3a14bc51..8641d4f7 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -65,6 +65,7 @@ class LeaderboardTask: templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None + @staticmethod def from_dict(data: dict): data_ = copy.copy(data) diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index c39192f7..d63a44e1 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -245,7 +245,10 @@ def build_task_config( if lang == "py": config["main"] = "eval.py" - + args = [] + if mode == SubmissionMode.BASELINE: + submission_content = "" + config["args"] = args return { **config, "sources": { @@ -259,7 +262,6 @@ def build_task_config( all_files[n] = submission_content else: all_files[n] = c - common = { "lang": task.lang.value, "arch": arch,