Skip to content

Commit cbc79e0

Browse files
authored
Initial support CUDARequirement extension (#1581)
Add CUDARequirement extension, standards track for future CWL spec. Checks for presence of CUDA using nvidia-smi. Includes support using Docker (--gpus) and Singularity (--nv).
1 parent 62fe629 commit cbc79e0

File tree

11 files changed

+230
-0
lines changed

11 files changed

+230
-0
lines changed

cwltool/cuda.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import subprocess # nosec
2+
import xml.dom.minidom # nosec
3+
from .loghandler import _logger
4+
from .utils import CWLObjectType
5+
6+
from typing import Tuple, cast
7+
8+
9+
def cuda_version_and_device_count() -> Tuple[str, int]:
    """Query nvidia-smi for the CUDA version and the number of attached GPUs.

    Returns a ``(cuda_version, device_count)`` tuple; ``("", 0)`` when
    nvidia-smi is missing, fails, or produces output without the expected
    ``<cuda_version>`` / ``<attached_gpus>`` elements.
    """
    try:
        out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
    except Exception as e:
        _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
        return ("", 0)

    dm = xml.dom.minidom.parseString(out)  # nosec

    # Guard against malformed/unexpected nvidia-smi output: indexing [0] or
    # dereferencing .firstChild on a missing element would otherwise raise.
    ag_tags = dm.getElementsByTagName("attached_gpus")
    if not ag_tags or ag_tags[0].firstChild is None:
        _logger.warning(
            "Error checking CUDA version: no <attached_gpus> in nvidia-smi output"
        )
        return ("", 0)
    ag = ag_tags[0].firstChild

    cv_tags = dm.getElementsByTagName("cuda_version")
    if not cv_tags or cv_tags[0].firstChild is None:
        _logger.warning(
            "Error checking CUDA version: no <cuda_version> in nvidia-smi output"
        )
        return ("", 0)
    cv = cv_tags[0].firstChild

    # Only Text nodes carry a .data attribute; other node types would fail.
    if isinstance(cv, xml.dom.minidom.Text) and isinstance(ag, xml.dom.minidom.Text):
        return (cv.data, int(ag.data))
    _logger.warning("Error checking CUDA version: unexpected nvidia-smi XML structure")
    return ("", 0)
19+
20+
21+
def _cuda_version_tuple(version: str) -> Tuple[int, ...]:
    """Parse an 'X.Y' version string into a tuple of ints for correct ordering."""
    return tuple(int(part) for part in version.split("."))


def cuda_check(cuda_req: CWLObjectType) -> int:
    """Check whether a CUDARequirement can be satisfied on this host.

    :param cuda_req: the CUDARequirement mapping (must contain
        ``cudaVersionMin``; may contain ``deviceCountMin``/``deviceCountMax``).
    :return: the number of GPU devices to use, or 0 if the requirement
        cannot be satisfied.
    """
    try:
        vmin = str(cuda_req["cudaVersionMin"])
        version, devices = cuda_version_and_device_count()
        if version == "":
            # nvidia-smi not detected, or failed some other way
            return 0
        # Compare versions as integer tuples, not floats: float("11.10") is
        # 11.1, which would incorrectly order "11.10" before "11.9".
        if _cuda_version_tuple(version) < _cuda_version_tuple(vmin):
            _logger.warning(
                "CUDA version '%s' is less than minimum version '%s'", version, vmin
            )
            return 0
        dmin = cast(int, cuda_req.get("deviceCountMin", 1))
        dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
        if devices < dmin:
            _logger.warning(
                "Requested at least %d GPU devices but only %d available", dmin, devices
            )
            return 0
        # Use as many devices as allowed, capped by what is available.
        return min(dmax, devices)
    except Exception as e:
        # Best-effort check: any unexpected failure means "not satisfied".
        _logger.warning("Error checking CUDA requirements: %s", e)
        return 0

cwltool/docker.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from .builder import Builder
1717
from .context import RuntimeContext
18+
from .cuda import cuda_check
1819
from .docker_id import docker_vm_id
1920
from .errors import WorkflowException
2021
from .job import ContainerCommandLineJob
@@ -395,6 +396,14 @@ def create_runtime(
395396
if runtimeContext.rm_container:
396397
runtime.append("--rm")
397398

399+
cuda_req, _ = self.builder.get_requirement(
400+
"http://commonwl.org/cwltool#CUDARequirement"
401+
)
402+
if cuda_req:
403+
# Checked earlier that the device count is non-zero in _setup
404+
count = cuda_check(cuda_req)
405+
runtime.append("--gpus=" + str(count))
406+
398407
cidfile_path = None # type: Optional[str]
399408
# add parameters to docker to write a container ID file
400409
if runtimeContext.user_space_docker_cmd is None:

cwltool/extensions-v1.1.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,42 @@ $graph:
6464
The number of MPI processes to start. If you give a string,
6565
this will be evaluated as a CWL Expression and it must
6666
evaluate to an integer.
67+
68+
- name: CUDARequirement
69+
type: record
70+
extends: cwl:ProcessRequirement
71+
inVocab: false
72+
doc: |
73+
Require support for NVIDIA CUDA (GPU hardware acceleration).
74+
fields:
75+
class:
76+
type: string
77+
doc: 'cwltool:CUDARequirement'
78+
jsonldPredicate:
79+
_id: "@type"
80+
_type: "@vocab"
81+
cudaVersionMin:
82+
type: string
83+
doc: |
84+
Minimum CUDA version to run the software, in X.Y format. This
85+
corresponds to a CUDA SDK release. When running directly on
86+
the host (not in a container) the host must have a compatible
87+
CUDA SDK (matching the exact version, or, starting with CUDA
88+
11.3, matching major version). When run in a container, the
89+
container image should provide the CUDA runtime, and the host
90+
driver is injected into the container. In this case, because
91+
CUDA drivers are backwards compatible, it is possible to
92+
use an older SDK with a newer driver across major versions.
93+
94+
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
95+
details.
96+
cudaComputeCapabilityMin:
97+
type: string
98+
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
99+
deviceCountMin:
100+
type: int?
101+
default: 1
102+
doc: Minimum number of GPU devices to request, default 1.
103+
deviceCountMax:
104+
type: int?
105+
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.

cwltool/extensions.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,42 @@ $graph:
174174
The number of MPI processes to start. If you give a string,
175175
this will be evaluated as a CWL Expression and it must
176176
evaluate to an integer.
177+
178+
- name: CUDARequirement
179+
type: record
180+
extends: cwl:ProcessRequirement
181+
inVocab: false
182+
doc: |
183+
Require support for NVIDIA CUDA (GPU hardware acceleration).
184+
fields:
185+
class:
186+
type: string
187+
doc: 'cwltool:CUDARequirement'
188+
jsonldPredicate:
189+
_id: "@type"
190+
_type: "@vocab"
191+
cudaVersionMin:
192+
type: string
193+
doc: |
194+
Minimum CUDA version to run the software, in X.Y format. This
195+
corresponds to a CUDA SDK release. When running directly on
196+
the host (not in a container) the host must have a compatible
197+
CUDA SDK (matching the exact version, or, starting with CUDA
198+
11.3, matching major version). When run in a container, the
199+
container image should provide the CUDA runtime, and the host
200+
driver is injected into the container. In this case, because
201+
CUDA drivers are backwards compatible, it is possible to
202+
use an older SDK with a newer driver across major versions.
203+
204+
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
205+
details.
206+
cudaComputeCapabilityMin:
207+
type: string
208+
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
209+
deviceCountMin:
210+
type: int?
211+
default: 1
212+
doc: Minimum number of GPU devices to request, default 1.
213+
deviceCountMax:
214+
type: int?
215+
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.

cwltool/job.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from . import env_to_stdout, run_job
4343
from .builder import Builder
4444
from .context import RuntimeContext
45+
from .cuda import cuda_check
4546
from .errors import UnsupportedRequirement, WorkflowException
4647
from .loghandler import _logger
4748
from .pathmapper import MapperEnt, PathMapper
@@ -174,6 +175,15 @@ def run(
174175
pass
175176

176177
def _setup(self, runtimeContext: RuntimeContext) -> None:
178+
179+
cuda_req, _ = self.builder.get_requirement(
180+
"http://commonwl.org/cwltool#CUDARequirement"
181+
)
182+
if cuda_req:
183+
count = cuda_check(cuda_req)
184+
if count == 0:
185+
raise WorkflowException("Could not satisfy CUDARequirement")
186+
177187
if not os.path.exists(self.outdir):
178188
os.makedirs(self.outdir)
179189

cwltool/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,12 +663,14 @@ def setup_schema(
663663
ext11 = res.read().decode("utf-8")
664664
use_custom_schema("v1.0", "http://commonwl.org/cwltool", ext10)
665665
use_custom_schema("v1.1", "http://commonwl.org/cwltool", ext11)
666+
use_custom_schema("v1.2", "http://commonwl.org/cwltool", ext11)
666667
use_custom_schema("v1.2.0-dev1", "http://commonwl.org/cwltool", ext11)
667668
use_custom_schema("v1.2.0-dev2", "http://commonwl.org/cwltool", ext11)
668669
use_custom_schema("v1.2.0-dev3", "http://commonwl.org/cwltool", ext11)
669670
else:
670671
use_standard_schema("v1.0")
671672
use_standard_schema("v1.1")
673+
use_standard_schema("v1.2")
672674
use_standard_schema("v1.2.0-dev1")
673675
use_standard_schema("v1.2.0-dev2")
674676
use_standard_schema("v1.2.0-dev3")

cwltool/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def filter(self, record: logging.LogRecord) -> bool:
120120
"http://commonwl.org/cwltool#NetworkAccess",
121121
"http://commonwl.org/cwltool#LoadListingRequirement",
122122
"http://commonwl.org/cwltool#InplaceUpdateRequirement",
123+
"http://commonwl.org/cwltool#CUDARequirement",
123124
]
124125

125126
cwl_files = (

cwltool/singularity.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from .builder import Builder
1515
from .context import RuntimeContext
16+
from .cuda import cuda_check
1617
from .errors import WorkflowException
1718
from .job import ContainerCommandLineJob
1819
from .loghandler import _logger
@@ -434,6 +435,13 @@ def create_runtime(
434435
else:
435436
runtime.extend(["--net", "--network", "none"])
436437

438+
cuda_req, _ = self.builder.get_requirement(
439+
"http://commonwl.org/cwltool#CUDARequirement"
440+
)
441+
if cuda_req:
442+
# Checked earlier that the device count is non-zero in _setup
443+
runtime.append("--nv")
444+
437445
for name, value in self.environment.items():
438446
env[f"SINGULARITYENV_{name}"] = str(value)
439447

tests/test_cuda.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from cwltool.cuda import cuda_version_and_device_count
2+
from cwltool.main import main
3+
from .util import (
4+
get_data,
5+
get_main_output,
6+
get_tool_env,
7+
needs_docker,
8+
needs_singularity_3_or_newer,
9+
)
10+
11+
import pytest
12+
13+
cuda_version = cuda_version_and_device_count()
14+
15+
16+
@needs_docker
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_docker() -> None:
    """A CUDARequirement tool should succeed inside a Docker container."""
    exit_code = main(
        [
            "--enable-ext",
            get_data("tests/wf/nvidia-smi-container.cwl"),
        ]
    )
    assert exit_code == 0
26+
27+
28+
@needs_singularity_3_or_newer
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_singularity() -> None:
    """A CUDARequirement tool should succeed inside a Singularity container."""
    exit_code = main(
        [
            "--enable-ext",
            "--singularity",
            get_data("tests/wf/nvidia-smi-container.cwl"),
        ]
    )
    assert exit_code == 0
39+
40+
41+
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_no_container() -> None:
    """A CUDARequirement tool should succeed directly on the host.

    Unlike the container tests above, this must NOT pass a container
    flag: the original version passed ``--singularity`` (copy-paste from
    test_cuda_singularity), which made it a duplicate container test
    instead of exercising the no-container code path in job.py.
    """
    params = [
        "--enable-ext",
        get_data("tests/wf/nvidia-smi.cwl"),
    ]
    assert main(params) == 0

tests/wf/nvidia-smi-container.cwl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
cwlVersion: v1.2
2+
class: CommandLineTool
3+
$namespaces:
4+
cwltool: "http://commonwl.org/cwltool#"
5+
requirements:
6+
cwltool:CUDARequirement:
7+
cudaVersionMin: "1.0"
8+
cudaComputeCapabilityMin: "1.0"
9+
DockerRequirement:
10+
dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
11+
inputs: []
12+
outputs: []
13+
# Assume this will exit non-zero (resulting in a failing test case) if
14+
# nvidia-smi doesn't detect any devices.
15+
baseCommand: "nvidia-smi"

0 commit comments

Comments
 (0)