Skip to content

Commit cbc79e0

Browse files
authored
Initial support CUDARequirement extension (#1581)
Add CUDARequirement extension, standards track for future CWL spec. Checks for presence of CUDA using nvidia-smi. Includes support using Docker (--gpus) and Singularity (--nv).
1 parent 62fe629 commit cbc79e0

File tree

11 files changed

+230
-0
lines changed

11 files changed

+230
-0
lines changed

cwltool/cuda.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import subprocess # nosec
2+
import xml.dom.minidom # nosec
3+
from .loghandler import _logger
4+
from .utils import CWLObjectType
5+
6+
from typing import Tuple, cast
7+
8+
9+
def cuda_version_and_device_count() -> Tuple[str, int]:
    """Query nvidia-smi for the CUDA version and the number of attached GPUs.

    Returns a ``(cuda_version, device_count)`` tuple; ``("", 0)`` when
    nvidia-smi is missing, fails, or produces output without the expected
    ``<cuda_version>`` / ``<attached_gpus>`` elements.
    """
    try:
        out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
    except Exception as e:
        _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
        return ("", 0)

    dm = xml.dom.minidom.parseString(out)  # nosec

    # Guard against malformed/unexpected nvidia-smi output: indexing [0] or
    # dereferencing .firstChild on a missing element would otherwise raise.
    ag_tags = dm.getElementsByTagName("attached_gpus")
    if not ag_tags or ag_tags[0].firstChild is None:
        _logger.warning(
            "Error checking CUDA version: no <attached_gpus> in nvidia-smi output"
        )
        return ("", 0)
    ag = ag_tags[0].firstChild

    cv_tags = dm.getElementsByTagName("cuda_version")
    if not cv_tags or cv_tags[0].firstChild is None:
        _logger.warning(
            "Error checking CUDA version: no <cuda_version> in nvidia-smi output"
        )
        return ("", 0)
    cv = cv_tags[0].firstChild

    # Only Text nodes carry a .data attribute; other node types would fail.
    if isinstance(cv, xml.dom.minidom.Text) and isinstance(ag, xml.dom.minidom.Text):
        return (cv.data, int(ag.data))
    _logger.warning("Error checking CUDA version: unexpected nvidia-smi XML structure")
    return ("", 0)
19+
20+
21+
def _cuda_version_tuple(version: str) -> Tuple[int, ...]:
    """Parse an 'X.Y' version string into a tuple of ints for correct ordering."""
    return tuple(int(part) for part in version.split("."))


def cuda_check(cuda_req: CWLObjectType) -> int:
    """Check whether a CUDARequirement can be satisfied on this host.

    :param cuda_req: the CUDARequirement mapping (must contain
        ``cudaVersionMin``; may contain ``deviceCountMin``/``deviceCountMax``).
    :return: the number of GPU devices to use, or 0 if the requirement
        cannot be satisfied.
    """
    try:
        vmin = str(cuda_req["cudaVersionMin"])
        version, devices = cuda_version_and_device_count()
        if version == "":
            # nvidia-smi not detected, or failed some other way
            return 0
        # Compare versions as integer tuples, not floats: float("11.10") is
        # 11.1, which would incorrectly order "11.10" before "11.9".
        if _cuda_version_tuple(version) < _cuda_version_tuple(vmin):
            _logger.warning(
                "CUDA version '%s' is less than minimum version '%s'", version, vmin
            )
            return 0
        dmin = cast(int, cuda_req.get("deviceCountMin", 1))
        dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
        if devices < dmin:
            _logger.warning(
                "Requested at least %d GPU devices but only %d available", dmin, devices
            )
            return 0
        # Use as many devices as allowed, capped by what is available.
        return min(dmax, devices)
    except Exception as e:
        # Best-effort check: any unexpected failure means "not satisfied".
        _logger.warning("Error checking CUDA requirements: %s", e)
        return 0

cwltool/docker.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from .builder import Builder
1717
from .context import RuntimeContext
18+
from .cuda import cuda_check
1819
from .docker_id import docker_vm_id
1920
from .errors import WorkflowException
2021
from .job import ContainerCommandLineJob
@@ -395,6 +396,14 @@ def create_runtime(
395396
if runtimeContext.rm_container:
396397
runtime.append("--rm")
397398

399+
cuda_req, _ = self.builder.get_requirement(
400+
"http://commonwl.org/cwltool#CUDARequirement"
401+
)
402+
if cuda_req:
403+
# Checked earlier that the device count is non-zero in _setup
404+
count = cuda_check(cuda_req)
405+
runtime.append("--gpus=" + str(count))
406+
398407
cidfile_path = None # type: Optional[str]
399408
# add parameters to docker to write a container ID file
400409
if runtimeContext.user_space_docker_cmd is None:

cwltool/extensions-v1.1.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,42 @@ $graph:
6464
The number of MPI processes to start. If you give a string,
6565
this will be evaluated as a CWL Expression and it must
6666
evaluate to an integer.
67+
68+
- name: CUDARequirement
69+
type: record
70+
extends: cwl:ProcessRequirement
71+
inVocab: false
72+
doc: |
73+
Require support for NVIDIA CUDA (GPU hardware acceleration).
74+
fields:
75+
class:
76+
type: string
77+
doc: 'cwltool:CUDARequirement'
78+
jsonldPredicate:
79+
_id: "@type"
80+
_type: "@vocab"
81+
cudaVersionMin:
82+
type: string
83+
doc: |
84+
Minimum CUDA version to run the software, in X.Y format. This
85+
corresponds to a CUDA SDK release. When running directly on
86+
the host (not in a container) the host must have a compatible
87+
CUDA SDK (matching the exact version, or, starting with CUDA
88+
11.3, matching major version). When run in a container, the
89+
container image should provide the CUDA runtime, and the host
90+
driver is injected into the container. In this case, because
91+
CUDA drivers are backwards compatible, it is possible to
92+
use an older SDK with a newer driver across major versions.
93+
94+
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
95+
details.
96+
cudaComputeCapabilityMin:
97+
type: string
98+
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
99+
deviceCountMin:
100+
type: int?
101+
default: 1
102+
doc: Minimum number of GPU devices to request, default 1.
103+
deviceCountMax:
104+
type: int?
105+
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.

cwltool/extensions.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,42 @@ $graph:
174174
The number of MPI processes to start. If you give a string,
175175
this will be evaluated as a CWL Expression and it must
176176
evaluate to an integer.
177+
178+
- name: CUDARequirement
179+
type: record
180+
extends: cwl:ProcessRequirement
181+
inVocab: false
182+
doc: |
183+
Require support for NVIDIA CUDA (GPU hardware acceleration).
184+
fields:
185+
class:
186+
type: string
187+
doc: 'cwltool:CUDARequirement'
188+
jsonldPredicate:
189+
_id: "@type"
190+
_type: "@vocab"
191+
cudaVersionMin:
192+
type: string
193+
doc: |
194+
Minimum CUDA version to run the software, in X.Y format. This
195+
corresponds to a CUDA SDK release. When running directly on
196+
the host (not in a container) the host must have a compatible
197+
CUDA SDK (matching the exact version, or, starting with CUDA
198+
11.3, matching major version). When run in a container, the
199+
container image should provide the CUDA runtime, and the host
200+
driver is injected into the container. In this case, because
201+
CUDA drivers are backwards compatible, it is possible to
202+
use an older SDK with a newer driver across major versions.
203+
204+
See https://docs.nvidia.com/deploy/cuda-compatibility/ for
205+
details.
206+
cudaComputeCapabilityMin:
207+
type: string
208+
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
209+
deviceCountMin:
210+
type: int?
211+
default: 1
212+
doc: Minimum number of GPU devices to request, default 1.
213+
deviceCountMax:
214+
type: int?
215+
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.

cwltool/job.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from . import env_to_stdout, run_job
4343
from .builder import Builder
4444
from .context import RuntimeContext
45+
from .cuda import cuda_check
4546
from .errors import UnsupportedRequirement, WorkflowException
4647
from .loghandler import _logger
4748
from .pathmapper import MapperEnt, PathMapper
@@ -174,6 +175,15 @@ def run(
174175
pass
175176

176177
def _setup(self, runtimeContext: RuntimeContext) -> None:
178+
179+
cuda_req, _ = self.builder.get_requirement(
180+
"http://commonwl.org/cwltool#CUDARequirement"
181+
)
182+
if cuda_req:
183+
count = cuda_check(cuda_req)
184+
if count == 0:
185+
raise WorkflowException("Could not satisfy CUDARequirement")
186+
177187
if not os.path.exists(self.outdir):
178188
os.makedirs(self.outdir)
179189

cwltool/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,12 +663,14 @@ def setup_schema(
663663
ext11 = res.read().decode("utf-8")
664664
use_custom_schema("v1.0", "http://commonwl.org/cwltool", ext10)
665665
use_custom_schema("v1.1", "http://commonwl.org/cwltool", ext11)
666+
use_custom_schema("v1.2", "http://commonwl.org/cwltool", ext11)
666667
use_custom_schema("v1.2.0-dev1", "http://commonwl.org/cwltool", ext11)
667668
use_custom_schema("v1.2.0-dev2", "http://commonwl.org/cwltool", ext11)
668669
use_custom_schema("v1.2.0-dev3", "http://commonwl.org/cwltool", ext11)
669670
else:
670671
use_standard_schema("v1.0")
671672
use_standard_schema("v1.1")
673+
use_standard_schema("v1.2")
672674
use_standard_schema("v1.2.0-dev1")
673675
use_standard_schema("v1.2.0-dev2")
674676
use_standard_schema("v1.2.0-dev3")

cwltool/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def filter(self, record: logging.LogRecord) -> bool:
120120
"http://commonwl.org/cwltool#NetworkAccess",
121121
"http://commonwl.org/cwltool#LoadListingRequirement",
122122
"http://commonwl.org/cwltool#InplaceUpdateRequirement",
123+
"http://commonwl.org/cwltool#CUDARequirement",
123124
]
124125

125126
cwl_files = (

cwltool/singularity.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from .builder import Builder
1515
from .context import RuntimeContext
16+
from .cuda import cuda_check
1617
from .errors import WorkflowException
1718
from .job import ContainerCommandLineJob
1819
from .loghandler import _logger
@@ -434,6 +435,13 @@ def create_runtime(
434435
else:
435436
runtime.extend(["--net", "--network", "none"])
436437

438+
cuda_req, _ = self.builder.get_requirement(
439+
"http://commonwl.org/cwltool#CUDARequirement"
440+
)
441+
if cuda_req:
442+
# Checked earlier that the device count is non-zero in _setup
443+
runtime.append("--nv")
444+
437445
for name, value in self.environment.items():
438446
env[f"SINGULARITYENV_{name}"] = str(value)
439447

tests/test_cuda.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from cwltool.cuda import cuda_version_and_device_count
2+
from cwltool.main import main
3+
from .util import (
4+
get_data,
5+
get_main_output,
6+
get_tool_env,
7+
needs_docker,
8+
needs_singularity_3_or_newer,
9+
)
10+
11+
import pytest
12+
13+
cuda_version = cuda_version_and_device_count()
14+
15+
16+
@needs_docker
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_docker() -> None:
    """A CUDARequirement tool should succeed inside a Docker container."""
    exit_code = main(
        [
            "--enable-ext",
            get_data("tests/wf/nvidia-smi-container.cwl"),
        ]
    )
    assert exit_code == 0
26+
27+
28+
@needs_singularity_3_or_newer
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_singularity() -> None:
    """A CUDARequirement tool should succeed inside a Singularity container."""
    exit_code = main(
        [
            "--enable-ext",
            "--singularity",
            get_data("tests/wf/nvidia-smi-container.cwl"),
        ]
    )
    assert exit_code == 0
39+
40+
41+
@pytest.mark.skipif(
    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_no_container() -> None:
    """A CUDARequirement tool should succeed directly on the host.

    Unlike the container tests above, this must NOT pass a container
    flag: the original version passed ``--singularity`` (copy-paste from
    test_cuda_singularity), which made it a duplicate container test
    instead of exercising the no-container code path in job.py.
    """
    params = [
        "--enable-ext",
        get_data("tests/wf/nvidia-smi.cwl"),
    ]
    assert main(params) == 0

tests/wf/nvidia-smi-container.cwl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
cwlVersion: v1.2
2+
class: CommandLineTool
3+
$namespaces:
4+
cwltool: "http://commonwl.org/cwltool#"
5+
requirements:
6+
cwltool:CUDARequirement:
7+
cudaVersionMin: "1.0"
8+
cudaComputeCapabilityMin: "1.0"
9+
DockerRequirement:
10+
dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
11+
inputs: []
12+
outputs: []
13+
# Assume this will exit non-zero (resulting in a failing test case) if
14+
# nvidia-smi doesn't detect any devices.
15+
baseCommand: "nvidia-smi"

0 commit comments

Comments
 (0)