@@ -431,7 +431,7 @@ def before_inline(a: ty.handle, c: ty.handle) -> None:
431431
432432 .. code-block:: python
433433
434- sch = tir.Schedule(before_inline, debug_mode=True )
434+ sch = tir.Schedule(before_inline)
435435 sch.compute_inline(sch.get_block("B"))
436436 print(tvm.script.asscript(sch.mod["main"]))
437437
@@ -491,7 +491,7 @@ def before_inline(a: ty.handle, c: ty.handle) -> None:
491491
492492 .. code-block:: python
493493
494- sch = tir.Schedule(before_inline, debug_mode=True )
494+ sch = tir.Schedule(before_inline)
495495 sch.reverse_compute_inline(sch.get_block("C"))
496496 print(tvm.script.asscript(sch.mod["main"]))
497497
@@ -512,6 +512,149 @@ def after_inline(a: ty.handle, c: ty.handle) -> None:
512512 ########## Schedule: loop binding/annotation ##########
513513 ########## Schedule: cache read/write ##########
514514 ########## Schedule: reduction ##########
515+ def rfactor(self, loop: LoopRV, factor_axis: int) -> BlockRV:
516+ """Factorize an associative reduction block by the specified loop.
517+
518+ An associative reduction cannot be parallelized directly,
519+ because it leads to potential race conditions during accumulation.
520+ Instead, the reduction can be factorized on a loop with the following steps:
521+ - Step 1: evenly slice the reduction into `n` separate chunks, where `n` is the loop extent;
522+ - Step 2: compute the chunks separately and write the results into `n` intermediate buffers;
523+ - Step 3: accumulate the `n` intermediate buffers into the result buffer.
524+ Note that Step 2 above introduces opportunities for parallelization.
525+
526+ RFactor is a schedule primitive that implements the transformation described above:
527+ Given a block that writes to buffer `B`, it factorizes a loop of extent `n`.
528+
529+ For example, the pseudocode below accumulates `B[i] = sum(A[i, :, :])`:
530+
531+ .. code-block:: python
532+
533+ for i in range(128): # loop i is a data parallel loop
534+ for j in range(128): # loop j is a reduction loop
535+ for k in range(128): # loop k is a reduction loop
536+ B[i] = B[i] + A[i, j, k]
537+
538+ Suppose RFactor is applied on the innermost loop `k` with `factor_axis = 1`.
539+ RFactor then creates an intermediate buffer and two blocks.
540+
541+ 1. The intermediate buffer, or "rf-buffer", is a buffer of rank `ndim(B) + 1` and
542+ size `size(B) * n`, whose shape expands from `shape(B)` by adding an axis of `n`
543+ at the position specified by `factor_axis`. For example,
544+
545+ * shape(B) = [1, 2, 3], factor_axis = 0 => shape(B_rf) = [n, 1, 2, 3]
546+ * shape(B) = [1, 2, 3], factor_axis = 1 => shape(B_rf) = [1, n, 2, 3]
547+ * shape(B) = [1, 2, 3], factor_axis = 2 => shape(B_rf) = [1, 2, n, 3]
548+ * shape(B) = [1, 2, 3], factor_axis = 3 => shape(B_rf) = [1, 2, 3, n]
549+
550+ 2. The rfactor block, or "rf-block", is a block that writes to the `rf-buffer` without
551+ accumulating over the loop `k`, i.e. the loop `k` is converted from a reduction loop
552+ to a data parallel loop. In our example, the rf-block is:
553+
554+ .. code-block:: python
555+
556+ B_rf = np.zeros((128, 128)) # the rf-buffer
557+ for k in range(128): # loop k is converted to a data parallel loop
558+ for i in range(128): # loop i is a data parallel loop (unchanged)
559+ for j in range(128): # loop j is a reduction loop (unchanged)
560+ B_rf[i, k] = B_rf[i, k] + A[i, j, k]
561+
562+
563+ 3. The write-back block, or "wb-block", is a block that accumulates the rf-buffer into
564+ the result buffer. All the reduction loops are removed except the loop `k` for accumulation.
565+ In our example, the wb-block is:
566+
567+ .. code-block:: python
568+
569+ for i in range(128): # loop i is a data parallel loop (unchanged)
570+ # loop j is removed because it is a reduction loop
571+ for k in range(128): # loop k is a reduction loop (unchanged)
572+ B[i] = B[i] + B_rf[i, k]
573+
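For concreteness, here is a minimal standalone NumPy sketch (plain NumPy, not the
schedule API) checking that the two-stage rf-block / wb-block computation above
reproduces the direct reduction:

.. code-block:: python

    import numpy as np

    # Illustration only; this is not the TVM schedule API.
    A = np.random.rand(128, 128, 128)

    # original reduction: B[i] = sum over j and k of A[i, j, k]
    B_direct = A.sum(axis=(1, 2))

    # rf-block: B_rf[i, k] = sum over j of A[i, j, k]  (factor_axis = 1, as in the illustration)
    B_rf = A.sum(axis=1)

    # wb-block: B[i] = sum over k of B_rf[i, k]
    B_two_stage = B_rf.sum(axis=1)

    assert np.allclose(B_direct, B_two_stage)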
574+
575+ Parameters
576+ ----------
577+ loop : LoopRV
578+ The loop outside the block for which we want to do rfactor
579+ factor_axis : int
580+ The position where the new dimension is placed in the newly introduced rfactor buffer
581+
582+ Returns
583+ -------
584+ rf_block : BlockRV
585+ The block which computes partial results over each slice (i.e., the first block
586+ as described in the above illustration)
587+
588+ Examples
589+ --------
590+
591+ Before rfactor, in TensorIR, the IR is:
592+
593+ .. code-block:: python
594+
595+ @tvm.script.tir
596+ def before_rfactor(a: ty.handle, b: ty.handle) -> None:
597+ A = tir.match_buffer(a, (128, 128, 128))
598+ B = tir.match_buffer(b, (128,))
599+ with tir.block([128, tir.reduce_axis(0, 128),
600+ tir.reduce_axis(0, 128)], "B") as [vii, vi, vj]:
601+ with tir.init():
602+ B[vii] = 0.0
603+ B[vii] = B[vii] + A[vii, vi, vj]
604+
605+ Create the schedule and do rfactor:
606+
607+ .. code-block:: python
608+
609+ sch = tir.Schedule(before_rfactor)
610+ _, _, k = sch.get_loops(sch.get_block("B"))
611+ sch.rfactor(k, 0)
612+ print(tvm.script.asscript(sch.mod["main"]))
613+
614+ After applying rfactor, the IR becomes:
615+
616+ .. code-block:: python
617+
618+ @tvm.script.tir
619+ def after_rfactor(a: ty.handle, b: ty.handle) -> None:
620+ A = tir.match_buffer(a, [128, 128, 128])
621+ B = tir.match_buffer(b, [128])
622+ B_rf = tir.alloc_buffer([128, 128])
623+ with tir.block([128, 128, tir.reduce_axis(0, 128)], "B_rf") as [vi2, vii, vi]:
624+ with tir.init():
625+ B_rf[vi2, vii] = 0.0
626+ B_rf[vi2, vii] = (B_rf[vi2, vii] + A[vii, vi, vi2])
627+ with tir.block([128, tir.reduce_axis(0, 128)], "B") as [vii_1, vi2_1]:
628+ with tir.init():
629+ B[vii_1] = 0.0
630+ B[vii_1] = (B[vii_1] + B_rf[vi2_1, vii_1])
631+
632+
633+ Note
634+ ----
635+
636+ RFactor requires:
637+ 1) `loop` has only one child block, and it is a reduction block;
638+ 2) `loop` is a reduction loop, i.e. the loop variable is bound to only reduction variables
639+ in the block binding;
640+ 3) `loop` is not parallelized, vectorized, unrolled or bound to any thread axis;
641+ 4) The block scope that `loop` is in is a staged-pipeline;
642+ 5) The outermost loop outside the reduction block should have the reduction block as its
643+ first child block;
644+ 6) The outermost reduction loop should have only one child block;
645+ 7) A unit-extent loop (i.e. a loop with extent 1) that is not bound to any reduction or
646+ data parallel variables in the block binding should not appear under any reduction loop;
647+ 8) The reduction block should write to only one buffer, and its init and body are both
648+ simple `BufferStore`s, and the pattern is registered as an associative reducer.
649+ The pre-defined patterns include: plus, multiplication, min and max;
650+ 9) None of the loops on top of the block can be bound to both a data parallel and a
651+ reduction block binding at the same time;
652+ 10) `factor_axis` should be in range `[-ndim(B) - 1, ndim(B)]`,
653+ where `B` is the buffer that the reduction block writes to.
654+ Negative indexing is normalized according to numpy convention (see the sketch below).
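
As a quick illustration of requirement 10, here is a hypothetical helper (not part of
the schedule API) showing how a `factor_axis` value is normalized and where the new
axis of extent `n` lands in the rf-buffer shape:

.. code-block:: python

    def rf_buffer_shape(shape_b, n, factor_axis):
        # Hypothetical helper: derive shape(B_rf) from shape(B), the loop extent n,
        # and factor_axis. Negative indices are normalized over the documented
        # valid range [-ndim - 1, ndim].
        ndim = len(shape_b)
        axis = factor_axis + ndim + 1 if factor_axis < 0 else factor_axis
        return list(shape_b[:axis]) + [n] + list(shape_b[axis:])

    assert rf_buffer_shape([1, 2, 3], 4, 1) == [1, 4, 2, 3]
    assert rf_buffer_shape([1, 2, 3], 4, -1) == [1, 2, 3, 4]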
655+ """
656+ return _ffi_api_schedule.ScheduleRFactor(self, loop, factor_axis)  # type: ignore # pylint: disable=no-member
657+
515658 ########## Schedule: blockize & tensorize ##########
516659
517660