
Commit 9f59836

Authored by wsmoses and Avik Pal (avik-pal)
Fix for jll (#1228)
* Fix for jll
* fixup
* chore: run formatter
* fix: pointer type for allowed_devices
* fix
* fix: gpu build
* Update Project.toml
* Update pipeline.yml
* Update .buildkite/pipeline.yml
* Update .buildkite/pipeline.yml

---------

Co-authored-by: Avik Pal <[email protected]>
Co-authored-by: Avik Pal <[email protected]>
1 parent e43cd85 commit 9f59836

9 files changed: 37 additions, 28 deletions

.buildkite/pipeline.yml

Lines changed: 1 addition & 1 deletion

@@ -42,8 +42,8 @@ steps:
       cuda: "*"
     env:
       REACTANT_TEST_GROUP: "{{matrix.group}}"
-      CUDA_VISIBLE_DEVICES: 0
       JULIA_DEBUG: "Reactant,Reactant_jll"
+      CUDA_VISIBLE_DEVICES: 0
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -87,7 +87,7 @@ PythonCall = "0.9"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.9"
-Reactant_jll = "0.0.155"
+Reactant_jll = "0.0.158"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
 Sockets = "1.10"

deps/ReactantExtra/API.cpp

Lines changed: 23 additions & 18 deletions

@@ -413,9 +413,9 @@ extern "C" PjRtClient *MakeCPUClient(uint8_t asynchronous, int node_id) {
 
 // xla/python/xla.cc 390
 extern "C" PjRtClient *
-MakeGPUClient(int node_id, int num_nodes, int *allowed_devices,
-              int num_allowed_devices, double memory_fraction, bool preallocate,
-              const char *platform_name, const char **error,
+MakeGPUClient(int node_id, int num_nodes, int64_t *allowed_devices,
+              int64_t num_allowed_devices, double memory_fraction,
+              bool preallocate, const char *platform_name, const char **error,
               void *distributed_runtime_client) {
   GpuClientOptions options;
 
@@ -437,10 +437,15 @@ MakeGPUClient(int node_id, int num_nodes, int *allowed_devices,
   options.allocator_config.memory_fraction = memory_fraction;
   options.node_id = node_id;
   options.num_nodes = num_nodes;
-  options.allowed_devices =
-      allowed_devices ? std::set<int>(allowed_devices,
-                                      allowed_devices + num_allowed_devices)
-                      : std::optional<std::set<int>>();
+  if (allowed_devices) {
+    std::set<int> allowed_devices_set;
+    for (int i = 0; i < num_allowed_devices; i++) {
+      allowed_devices_set.insert(static_cast<int>(allowed_devices[i]));
+    }
+    options.allowed_devices = allowed_devices_set;
+  } else {
+    options.allowed_devices = std::optional<std::set<int>>();
+  }
   options.platform_name =
       platform_name ? std::string(platform_name) : std::optional<std::string>();
   // options.collectives = num_nodes;
@@ -1406,8 +1411,10 @@ ifrt_compile(ifrt::Client *client, MlirModule cmod, int64_t device_id,
       device_id, mesh_ids, num_mesh_ids, xla_gpu_cuda_data_dir,
       use_shardy_partitioner, num_replicas, num_partitions,
       use_spmd_partitioning);
+  xla::ifrt::DeviceListRef devices = MyValueOrThrow(
+      xla::ifrt::GetDeviceListFromXlaCompileOptions(client, compile_options));
   auto options = std::make_unique<xla::ifrt::XlaCompileOptions>(
-      xla::ifrt::XlaCompileOptions(compile_options));
+      compile_options, std::move(devices));
 
   mlir::ModuleOp cmod_op = cast<ModuleOp>(*unwrap(cmod));
   if (use_spmd_partitioning && use_shardy_partitioner) {
@@ -1635,10 +1642,12 @@ ifrt_make_pjrt_cpu_client(uint8_t asynchronous, int node_id, int num_nodes,
       kv_store);
 }
 
-extern "C" ifrt::Client *ifrt_make_pjrt_gpu_client(
-    int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
-    double memory_fraction, bool preallocate, const char *platform_name,
-    const char **error, void *distributed_runtime_client) {
+extern "C" ifrt::Client *
+ifrt_make_pjrt_gpu_client(int node_id, int num_nodes, int64_t *allowed_devices,
+                          int64_t num_allowed_devices, double memory_fraction,
+                          bool preallocate, const char *platform_name,
+                          const char **error,
+                          void *distributed_runtime_client) {
   PjRtClient *pjrt_client = MakeGPUClient(
       node_id, num_nodes, allowed_devices, num_allowed_devices, memory_fraction,
       preallocate, platform_name, error, distributed_runtime_client);
@@ -2457,12 +2466,8 @@ extern "C" void ifrt_hlo_module_cost_analysis_properties(
 
 #pragma endregion
 
-extern "C" void dump_op(Operation *op) {
-  llvm::errs() << *op << "\n";
-}
-extern "C" void dump_mval(mlir::Value v) {
-  llvm::errs() << v << "\n";
-}
+extern "C" void dump_op(Operation *op) { llvm::errs() << *op << "\n"; }
+extern "C" void dump_mval(mlir::Value v) { llvm::errs() << v << "\n"; }
 extern "C" void dump_operation(Operation *op, const char *filename) {
   std::error_code EC;
   llvm::raw_fd_ostream file(filename, EC, llvm::sys::fs::OF_Text);
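For illustration, a minimal hypothetical C++ caller of the updated MakeGPUClient signature (int64_t device ids), assuming the extern "C" prototype above is visible to the caller. The stand-in forward declaration, device list, memory fraction, and platform name are illustrative values, not Reactant defaults.

    #include <cstdint>
    #include <cstdio>

    class PjRtClient;  // stand-in forward declaration for xla::PjRtClient

    extern "C" PjRtClient *
    MakeGPUClient(int node_id, int num_nodes, int64_t *allowed_devices,
                  int64_t num_allowed_devices, double memory_fraction,
                  bool preallocate, const char *platform_name, const char **error,
                  void *distributed_runtime_client);

    int main() {
      int64_t devices[] = {0};  // restrict the client to GPU 0
      const char *error = nullptr;
      PjRtClient *client = MakeGPUClient(
          /*node_id=*/0, /*num_nodes=*/1, devices, /*num_allowed_devices=*/1,
          /*memory_fraction=*/0.75, /*preallocate=*/false,
          /*platform_name=*/"gpu", &error, /*distributed_runtime_client=*/nullptr);
      if (client == nullptr && error != nullptr)
        std::fprintf(stderr, "MakeGPUClient failed: %s\n", error);
      return 0;
    }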

deps/ReactantExtra/BUILD

Lines changed: 5 additions & 1 deletion

@@ -900,7 +900,7 @@ cc_library(
        "-Wl,-exported_symbol,_addSdyPropagationPipeline",
    ]}),
    deps = [
-      "@enzyme//:EnzymeMLIR",
+        "@enzyme//:EnzymeMLIR",
        "@llvm-project//mlir:AffineDialect",
        "@llvm-project//mlir:AllPassesAndDialects",
        "@llvm-project//mlir:ArithDialect",
@@ -1025,8 +1025,11 @@ cc_library(
        "@jax//jaxlib/mosaic:tpu_dialect_capi_objects",
        "@jax//jaxlib/triton:triton_dialect_capi_objects",
        "@xla//xla/stream_executor/cuda:cuda_compute_capability_proto_cc_impl",
+        "@xla//xla/service:gpu_plugin",
+        "@xla//xla/pjrt/c:pjrt_c_api_gpu",
    ] + select({
        "@xla//xla/tsl:is_cuda_enabled_and_oss":[
+            "@xla//xla/stream_executor:cuda_platform",
            "@xla//xla/stream_executor/cuda:all_runtime",
            "@xla//xla/service/gpu/model:hlo_op_profiles",
            "@xla//xla/service/gpu/model:hlo_op_profile_proto_cc_impl",
@@ -1040,6 +1043,7 @@ cc_library(
        "//conditions:default": [
        ],
    }) + if_rocm([
+        "@xla//xla/stream_executor:rocm_platform",
        "@xla//xla/service/gpu:amdgpu_compiler",
        "@xla//xla/backends/profiler/gpu:device_tracer",
    ]) + select({

deps/ReactantExtra/WORKSPACE

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ http_archive(
     urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)],
 )
 
-ENZYMEXLA_COMMIT = "fc12061c02f057da8cd22e7e7bb12e050eca3f60"
+ENZYMEXLA_COMMIT = "e1f2496cc251cc30bd2b155ad3133316617beca8"
 ENZYMEXLA_SHA256 = ""
 
 http_archive(

src/Compiler.jl

Lines changed: 1 addition & 1 deletion

@@ -915,7 +915,7 @@ end
 
 # TODO we want to be able to run the more advanced passes via transform dialect as an enzyme intermediate
 # However, this errs as we cannot attach the transform with to the funcop itself [as we run a functionpass].
-const enzyme_pass::String = "enzyme{postpasses=\"canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,arith-raise{stablehlo=true},canonicalize,cse,canonicalize\"}"
+const enzyme_pass::String = "enzyme{postpasses=\"arith-raise{stablehlo=true},canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,cse,canonicalize\"}"
 
 function run_pass_pipeline!(mod, pass_pipeline, key=""; enable_verifier=true)
     pm = MLIR.IR.PassManager()

src/xla/IFRT/Client.jl

Lines changed: 2 additions & 2 deletions

@@ -177,8 +177,8 @@ function MakeIFRTPJRTGPUClient(;
     client = @ccall MLIR.API.mlir_c.ifrt_make_pjrt_gpu_client(
         node_id::Cint,
         num_nodes::Cint,
-        allowed_devices::Ptr{Cvoid},
-        num_allowed_devices::Cint,
+        allowed_devices::Ptr{Int64},
+        num_allowed_devices::Int64,
         XLA.XLA_REACTANT_GPU_MEM_FRACTION[]::Cdouble,
         XLA.XLA_REACTANT_GPU_PREALLOCATE[]::Bool,
         platform::Cstring,

src/xla/PJRT/Client.jl

Lines changed: 2 additions & 2 deletions

@@ -163,8 +163,8 @@ function MakeGPUClient(;
     client = @ccall MLIR.API.mlir_c.MakeGPUClient(
         node_id::Cint,
         num_nodes::Cint,
-        allowed_devices::Ptr{Cvoid},
-        num_allowed_devices::Cint,
+        allowed_devices::Ptr{Int64},
+        num_allowed_devices::Int64,
         XLA.XLA_REACTANT_GPU_MEM_FRACTION[]::Cdouble,
         XLA.XLA_REACTANT_GPU_PREALLOCATE[]::Bool,
         platform::Cstring,

test/autodiff.jl

Lines changed: 1 addition & 1 deletion

@@ -148,7 +148,7 @@ end
         (res.val ≈ 4ones(2, 2)) &&
         (res.derivs[1] ≈ 4ones(2, 2)) &&
         (res.derivs[2] ≈ 2ones(2, 2))
-    end broken = true
+    end
 end
 
 @testset "onehot" begin
