@@ -297,7 +297,7 @@ function ka_with_reactant(ndrange, workgroupsize, obj, args...)
297
297
298
298
# figure out the optimal workgroupsize automatically
299
299
if KA. workgroupsize (obj) <: KA.DynamicSize && workgroupsize === nothing
300
- if ! Reactant. Compiler. PartitionKA[]
300
+ if ! Reactant. Compiler. PartitionKA[] || Reactant . Compiler . Raise[]
301
301
threads = prod (ndrange)
302
302
else
303
303
config = CUDA. launch_configuration (kernel. fun; max_threads= prod (ndrange))
@@ -459,6 +459,145 @@ function vendored_optimize_module!(
459
459
end
460
460
end
461
461
462
# Vendored variant of GPUCompiler's early-optimizer pipeline that makes the
# InstCombine pass optional: with the default `instcombine=false`, the cheaper
# InstSimplifyPass is scheduled in its place. Pass order is otherwise
# identical to upstream.
function vendored_buildEarlyOptimizerPipeline(mpm, @nospecialize(job), opt_level; instcombine=false)
    LLVM.add!(mpm, LLVM.NewPMCGSCCPassManager()) do cgscc_pm
        # TODO invokeCGSCCCallbacks
        LLVM.add!(cgscc_pm, LLVM.NewPMFunctionPassManager()) do fn_pm
            LLVM.add!(fn_pm, LLVM.Interop.AllocOptPass())
            LLVM.add!(fn_pm, LLVM.Float2IntPass())
            LLVM.add!(fn_pm, LLVM.LowerConstantIntrinsicsPass())
        end
    end
    LLVM.add!(mpm, GPULowerCPUFeaturesPass())
    if opt_level >= 1
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fn_pm
            # Exactly one of the two simplification passes runs per slot.
            simplify_pass = instcombine ? LLVM.InstCombinePass() : LLVM.InstSimplifyPass()
            if opt_level >= 2
                LLVM.add!(fn_pm, LLVM.SROAPass())
                LLVM.add!(fn_pm, simplify_pass)
                LLVM.add!(fn_pm, LLVM.JumpThreadingPass())
                LLVM.add!(fn_pm, LLVM.CorrelatedValuePropagationPass())
                LLVM.add!(fn_pm, LLVM.ReassociatePass())
                LLVM.add!(fn_pm, LLVM.EarlyCSEPass())
                LLVM.add!(fn_pm, LLVM.Interop.AllocOptPass())
            else
                LLVM.add!(fn_pm, simplify_pass)
                LLVM.add!(fn_pm, LLVM.EarlyCSEPass())
            end
            # TODO invokePeepholeCallbacks
        end
    end
end
498
+
499
# Vendored variant of GPUCompiler's intrinsic-lowering pipeline. Identical to
# upstream except that the InstCombine pass is optional: with the default
# `instcombine=false`, the cheaper InstSimplifyPass is scheduled instead.
function vendored_buildIntrinsicLoweringPipeline(mpm, @nospecialize(job), opt_level; instcombine::Bool=false)
    # Consistency fix: register via `LLVM.add!` like every other pass in this
    # pipeline (this line previously went through `GPUCompiler.add!`, which
    # only resolves because GPUCompiler itself imports LLVM's `add!`).
    LLVM.add!(mpm, LLVM.Interop.RemoveNIPass())

    # lower GC intrinsics
    if !GPUCompiler.uses_julia_runtime(job)
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
            LLVM.add!(fpm, GPULowerGCFramePass())
        end
    end

    # lower kernel state intrinsics
    # NOTE: we can only do so here, as GC lowering can introduce calls to the runtime,
    # and thus additional uses of the kernel state intrinsics.
    if job.config.kernel
        # TODO: now that all kernel state-related passes are being run here, merge some?
        LLVM.add!(mpm, AddKernelStatePass())
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
            LLVM.add!(fpm, LowerKernelStatePass())
        end
        LLVM.add!(mpm, CleanupKernelStatePass())
    end

    if !GPUCompiler.uses_julia_runtime(job)
        # remove dead uses of ptls
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
            LLVM.add!(fpm, LLVM.ADCEPass())
        end
        LLVM.add!(mpm, GPULowerPTLSPass())
    end

    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
        # lower exception handling
        if GPUCompiler.uses_julia_runtime(job)
            LLVM.add!(fpm, LLVM.Interop.LowerExcHandlersPass())
        end
        LLVM.add!(fpm, GPUCompiler.GCInvariantVerifierPass())
        LLVM.add!(fpm, LLVM.Interop.LateLowerGCPass())
        # Julia 1.11+ runs FinalLowerGC as a function pass; older versions as a
        # module pass (see the branch below).
        if GPUCompiler.uses_julia_runtime(job) && VERSION >= v"1.11.0-DEV.208"
            LLVM.add!(fpm, LLVM.Interop.FinalLowerGCPass())
        end
    end
    if GPUCompiler.uses_julia_runtime(job) && VERSION < v"1.11.0-DEV.208"
        LLVM.add!(mpm, LLVM.Interop.FinalLowerGCPass())
    end

    if opt_level >= 2
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
            LLVM.add!(fpm, LLVM.GVNPass())
            LLVM.add!(fpm, LLVM.SCCPPass())
            LLVM.add!(fpm, LLVM.DCEPass())
        end
    end

    # lower PTLS intrinsics
    if GPUCompiler.uses_julia_runtime(job)
        LLVM.add!(mpm, LLVM.Interop.LowerPTLSPass())
    end

    if opt_level >= 1
        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
            if instcombine
                LLVM.add!(fpm, LLVM.InstCombinePass())
            else
                LLVM.add!(fpm, LLVM.InstSimplifyPass())
            end
            LLVM.add!(fpm, LLVM.SimplifyCFGPass(; GPUCompiler.AggressiveSimplifyCFGOptions...))
        end
    end

    # remove Julia address spaces
    LLVM.add!(mpm, LLVM.Interop.RemoveJuliaAddrspacesPass())

    # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
    # FIXME: we should fix the inliner so that inlined code gets optimized early-on
    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
end
575
+
576
# Drop-in replacement for `GPUCompiler.buildNewPMPipeline!` that routes the
# early-optimizer and intrinsic-lowering stages through the vendored variants
# above, so that InstCombine is avoided throughout the whole pipeline.
function vendored_buildNewPMPipeline!(mpm, @nospecialize(job), opt_level)
    # Doesn't call instcombine
    GPUCompiler.buildEarlySimplificationPipeline(mpm, job, opt_level)
    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
    vendored_buildEarlyOptimizerPipeline(mpm, job, opt_level)
    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fn_pm
        # Doesn't call instcombine
        GPUCompiler.buildLoopOptimizerPipeline(fn_pm, job, opt_level)
        # Doesn't call instcombine
        GPUCompiler.buildScalarOptimizerPipeline(fn_pm, job, opt_level)
        if GPUCompiler.uses_julia_runtime(job) && opt_level >= 2
            # XXX: we disable vectorization, as this generally isn't useful for GPU targets
            # and actually causes issues with some back-end compilers (like Metal).
            # TODO: Make this not dependent on `uses_julia_runtime` (likely CPU), but it's own control
            # Doesn't call instcombine
            GPUCompiler.buildVectorPipeline(fn_pm, job, opt_level)
        end
        # if isdebug(:optim)
        #     add!(fn_pm, WarnMissedTransformationsPass())
        # end
    end
    vendored_buildIntrinsicLoweringPipeline(mpm, job, opt_level)
    GPUCompiler.buildCleanupPipeline(mpm, job, opt_level)
end
600
+
462
601
# compile to executable machine code
463
602
function compile (job)
464
603
# lower to PTX
@@ -495,11 +634,17 @@ function compile(job)
495
634
LLVM. register! (pb, CleanupKernelStatePass ())
496
635
497
636
LLVM. add! (pb, LLVM. NewPMModulePassManager ()) do mpm
498
- GPUCompiler . buildNewPMPipeline ! (mpm, job, opt_level)
637
+ vendored_buildNewPMPipeline ! (mpm, job, opt_level)
499
638
end
500
639
LLVM. run! (pb, mod, tm)
501
640
end
641
+ if Reactant. Compiler. DUMP_LLVMIR[]
642
+ println (" cuda.jl pre vendor IR\n " , string (mod))
643
+ end
502
644
vendored_optimize_module! (job, mod)
645
+ if Reactant. Compiler. DUMP_LLVMIR[]
646
+ println (" cuda.jl post vendor IR\n " , string (mod))
647
+ end
503
648
LLVM. run! (CUDA. GPUCompiler. DeadArgumentEliminationPass (), mod, tm)
504
649
505
650
for fname in (" gpu_report_exception" , " gpu_signal_exception" )
0 commit comments