From 12332746073986e2c12bf12c368d53c0fc27d1e5 Mon Sep 17 00:00:00 2001 From: Pedro Freire Date: Mon, 4 Nov 2019 15:29:57 +0100 Subject: [PATCH 1/3] Simplify and organize test_ops. We perform the following: - Simplify the functions slow_roi_pooling, slow_ps_roi_pooling, slow_ps_roi_align and bilinear_interpolate (including finding and removing a semi-bug in slow_ps_roi_pooling, which used bin_w instead of bin_h); - Write a slow_roi_align function, which was missing; - Create a base class testing all combinations of forward/backward, cpu/cuda, contiguous/non-contiguous; - Organize all testing inside the base class with _test_forward and _test_backward (which can be easily overridden if a particular op needs something different); an Op class then only needs to implement fn, get_script_fn, and expected_fn. A few points: - We are using the same inputs for all tests, and not trying all possible inputs in the domain of a given operation. One improvement would be to test more diverse inputs, and to tailor the inputs for some ops (e.g. different inputs for pooling ops and align ops). - Running all tests is quite slow (~1 min for the CPU tests alone), so that can possibly be improved. --- test/test_ops.py | 1315 ++++++++-------------------------------------- 1 file changed, 226 insertions(+), 1089 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index f4440869896..3eb9e9d78f6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,3 +1,4 @@ +from __future__ import division import numpy as np import torch from torch.autograd import gradcheck @@ -7,1169 +8,305 @@ from itertools import product import unittest +from collections import namedtuple +Example = namedtuple('Example', ['x', 'rois']) -class RoIPoolTester(unittest.TestCase): + +class RoIOpTester: @classmethod def setUpClass(cls): cls.dtype = torch.float64 - def slow_roi_pooling(self, x, rois, pool_h, pool_w, spatial_scale=1, - device=None, dtype=torch.float64): - if device is None: - device = torch.device("cpu") - c = x.size(1) - y = torch.zeros(rois.size(0), c, pool_h, pool_w, dtype=dtype, device=device) - - rois = torch.round(rois * spatial_scale) - - for n in range(0, y.size(0)): - for r, roi in enumerate(rois): - if roi[0] == n: - start_h, end_h = int(roi[2].item()), int(roi[4].item()) + 1 - start_w, end_w = int(roi[1].item()), int(roi[3].item()) + 1 - roi_x = x[roi[0].long(), :, start_h:end_h, start_w:end_w] - bin_h, bin_w = roi_x.size(-2) / float(pool_h), roi_x.size(-1) / float(pool_w) - - for j in range(0, pool_h): - cj = slice(int(np.floor(j * bin_h)), int(np.ceil((j + 1) * bin_h))) - for i in range(0, pool_w): - ci = slice(int(np.floor(i * bin_w)), int(np.ceil((i + 1) * bin_w))) - t = roi_x[:, cj, ci].reshape(c, -1) - if t.numel() > 0: - y[r, :, j, i] = torch.max(t, 1)[0] - return y - - def test_roi_pool_basic_cpu(self): - device = torch.device('cpu') - x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy) - dtype=self.dtype, device=device) - - pool_h, pool_w = (5, 5) - roi_pool = ops.RoIPool((pool_h, pool_w), 1) - y = roi_pool(x, rois) - - gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU') - - # non-contiguous - y = roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU') - - def 
test_roi_pool_cpu(self): - device = torch.device('cpu') - x = torch.rand(2, 1, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9], - [1, 0, 0, 9, 9]], - dtype=self.dtype, device=device) + def test_forward_cpu_contiguous(self): + self._test_forward(device=torch.device('cpu'), contiguous=True) - pool_h, pool_w = (5, 5) - roi_pool = ops.RoIPool((pool_h, pool_w), 1) - y = roi_pool(x, rois) - - gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for batch > 1') - - # non-contiguous - y = roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for batch > 1') - - def test_roi_pool_cpu_empty_rois(self): - device = torch.device('cpu') - x = torch.tensor( - [[[[0.1767, 1.2851, 4.2325, 4.8645, 7.1496]], - [[2.5916, 4.3361, 3.8143, 6.1329, 2.0230]], - [[1.4492, 3.3384, 4.0816, 6.3116, 5.1068]]]], - dtype=self.dtype, device=device) - rois = torch.tensor( - [[0., 1., 0., 4., 0.], - [0., 2., 0., 3., 0.], - [0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0.], - [0., 2., 0., 2., 0.]], - dtype=self.dtype, device=device) - - pool_h, pool_w = (1, 2) - roi_pool = ops.RoIPool((pool_h, pool_w), 1) - y = roi_pool(x, rois) - - gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU empty rois') - - # non-contiguous - y = roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'RoIPool layer incorrect on CPU for empty rois non-contiguous') - - def test_roi_pool_gradient_cpu(self): - device = torch.device('cpu') - x = torch.ones(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 4, 9], - [0, 0, 0, 4, 4]], - dtype=self.dtype, device=device) - - layer = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - - gt_grad = torch.tensor([[[[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]]]], - device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for roi_pool') - - def test_roi_pool_align_non_cont_grad_cpu(self): - devices = ['cpu'] - if torch.cuda.is_available(): - devices.append('cuda') - - for d in devices: - device = torch.device(d) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) - - grad_cont = torch.rand(3, 1, 5, 5, dtype=self.dtype, device=device) - grad = grad_cont.permute(2, 1, 3, 0).contiguous().permute(3, 1, 0, 2) - - for op in ['RoIPool', 'RoIAlign']: - x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - kwargs = {} - if op == 'RoIAlign': - kwargs['sampling_ratio'] = 
1 - m = getattr(ops, op)((5, 5), 1, **kwargs) - - y = m(x, rois) - y.backward(grad_cont) - - g1 = x.grad.detach().clone() - del x.grad - - y = m(x, rois) - y.backward(grad) - - g2 = x.grad.detach().clone() - del x.grad - self.assertTrue(torch.allclose(g1, g2), 'gradient incorrect for {}'.format(op)) - - def test_roi_pool_gradcheck_cpu(self): - device = torch.device('cpu') - x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) - - m = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device) - - def func(input): - return m(input, rois) - - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for roi_pool CPU') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for roi_pool CPU') + def test_forward_cpu_non_contiguous(self): + self._test_forward(device=torch.device('cpu'), contiguous=False) - @torch.jit.script - def script_func(input, rois): - return ops.roi_pool(input, rois, 5, 1.0)[0] + def test_backward_cpu_contiguous(self): + self._test_backward(device=torch.device('cpu'), contiguous=True) - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), 'gradcheck failed for scripted roi_pool') + def test_backward_cpu_non_contiguous(self): + self._test_backward(device=torch.device('cpu'), contiguous=False) @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_pool_basic_cuda(self): - device = torch.device('cuda') - x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy) - dtype=self.dtype, device=device) - - pool_h, pool_w = (5, 5) - roi_pool = ops.RoIPool((pool_h, pool_w), 1) - y = roi_pool(x, rois) - - gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect') - - y = roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect') + def test_forward_cuda_contiguous(self): + self._test_forward(device=torch.device('cuda'), contiguous=True) @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_pool_cuda(self): - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - x = torch.rand(2, 1, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9], - [1, 0, 0, 9, 9]], - dtype=self.dtype, device=device) - - pool_h, pool_w = (5, 5) - roi_pool = ops.RoIPool((pool_h, pool_w), 1) - y = roi_pool(x, rois) - - gt_y = self.slow_roi_pooling(x, rois, pool_h, pool_w, device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect') - - y = roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'RoIPool layer incorrect') + def test_forward_cuda_non_contiguous(self): + self._test_forward(device=torch.device('cuda'), contiguous=False) @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_pool_gradient_cuda(self): - device = torch.device('cuda') - layer = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device) - x = 
torch.ones(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 4, 9], - [0, 0, 0, 4, 4]], - dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - gt_grad = torch.tensor([[[[2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.], - [2., 1., 2., 1., 2., 0., 1., 0., 1., 0.], - [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]]]], - device=device, dtype=self.dtype) - - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for roi_pool') + def test_backward_cuda_contiguous(self): + self._test_backward(device=torch.device('cuda'), contiguous=True) @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_pool_gradcheck_cuda(self): - device = torch.device('cuda') - x = torch.rand(1, 1, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) - - m = ops.RoIPool((5, 5), 1).to(dtype=self.dtype, device=device) - - def func(input): - return m(input, rois) - - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for roi_pool CUDA') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for roi_pool CUDA') - - @torch.jit.script - def script_func(input, rois): - return ops.roi_pool(input, rois, 5, 1.0)[0] - - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), - 'gradcheck failed for scripted roi_pool on CUDA') - - -class RoIAlignTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - torch.manual_seed(123) - cls.dtype = torch.float32 - cls.x = torch.rand(1, 1, 10, 10, dtype=cls.dtype) - cls.single_roi = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy) - dtype=cls.dtype) - cls.rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9]], - dtype=cls.dtype) - - cls.gt_y_single = torch.tensor( - [[[[0.41617328, 0.5040753, 0.25266218, 0.4296828, 0.29928464], - [0.5210769, 0.57222337, 0.2524979, 0.32063985, 0.32635176], - [0.73108256, 0.6114335, 0.62033176, 0.8188273, 0.5562218], - [0.83115816, 0.70803946, 0.7084047, 0.74928707, 0.7769296], - [0.54266506, 0.45964524, 0.5780159, 0.80522037, 0.7321807]]]], dtype=cls.dtype) - - cls.gt_y_multiple = torch.tensor( - [[[[0.49311584, 0.35972416, 0.40843594, 0.3638034, 0.49751836], - [0.70881474, 0.75481665, 0.5826779, 0.34767765, 0.46865487], - [0.4740328, 0.69306874, 0.3617804, 0.47145438, 0.66130304], - [0.6861706, 0.17634538, 0.47194335, 0.42473823, 0.37930614], - [0.62666404, 0.49973848, 0.37911576, 0.5842756, 0.7176864]]], - [[[0.67499936, 0.6607055, 0.42656037, 0.46134934, 0.42144877], - [0.7471722, 0.7235433, 0.14512213, 0.13031253, 0.289369], - [0.8443615, 0.6659734, 0.23614208, 0.14719573, 0.4268827], - [0.69429564, 0.5621515, 0.5019923, 0.40678093, 0.34556213], - [0.51315194, 0.7177093, 0.6494485, 0.6775592, 0.43865064]]], - [[[0.24465509, 0.36108392, 0.64635646, 0.4051828, 0.33956185], - [0.49006107, 0.42982674, 0.34184104, 0.15493104, 0.49633422], - [0.54400194, 0.5265246, 0.22381854, 0.3929715, 0.6757667], - [0.32961223, 0.38482672, 0.68877804, 0.71822757, 0.711909], - [0.561259, 0.71047884, 0.84651315, 0.8541089, 0.644432]]]], 
dtype=cls.dtype) - - cls.x_grad = torch.tensor( - [[[[0.075625, 0.15125, 0.15124999, 0.15125002, 0.15812504, - 0.15812503, 0.15124999, 0.15124999, 0.15125006, 0.0756249], - [0.15125, 0.30250007, 0.3025, 0.30250007, 0.31625012, - 0.31625003, 0.3025, 0.3025, 0.30250013, 0.1512498], - [0.15124999, 0.3025, 0.30249995, 0.3025, 0.31625006, - 0.31625, 0.30249995, 0.30249995, 0.30250007, 0.15124978], - [0.15125002, 0.30250007, 0.3025, 0.30250007, 0.31625012, - 0.3162501, 0.3025, 0.3025, 0.30250013, 0.15124981], - [0.15812504, 0.31625012, 0.31625006, 0.31625012, 0.33062524, - 0.3306251, 0.31625006, 0.31625006, 0.3162502, 0.15812483], - [0.5181251, 1.0962502, 1.0362502, 1.0962503, 0.69062525, 0.6906252, - 1.0962502, 1.0362502, 1.0962503, 0.5181248], - [0.93125, 1.9925, 1.8624997, 1.9925, 1.0962502, 1.0962502, - 1.9925, 1.8624998, 1.9925, 0.9312496], - [0.8712501, 1.8625, 1.7425002, 1.8625001, 1.0362502, 1.0362502, - 1.8625, 1.7425001, 1.8625002, 0.8712497], - [0.93125004, 1.9925, 1.8625002, 1.9925, 1.0962503, 1.0962503, - 1.9925001, 1.8625001, 1.9925001, 0.93124974], - [0.43562484, 0.9312497, 0.8712497, 0.9312497, 0.5181249, 0.5181248, - 0.9312496, 0.8712497, 0.93124974, 0.43562466]]]], dtype=cls.dtype) - - def test_roi_align_basic_cpu(self): - device = torch.device('cpu') - x = self.x.to(device) - single_roi = self.single_roi.to(device) - gt_y_single = self.gt_y_single.to(device) - - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - y = roi_align(x, single_roi) - - self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CPU') - - y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), single_roi) - self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CPU') - - def test_roi_align_cpu(self): - device = torch.device('cpu') - x = self.x.to(device) - rois = self.rois.to(device) - gt_y_multiple = self.gt_y_multiple.to(device) - - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - y = roi_align(x, rois) - - self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CPU') - - y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), rois) - self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CPU') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_align_basic_cuda(self): - device = torch.device('cuda') - x = self.x.to(device) - single_roi = self.single_roi.to(device) - gt_y_single = self.gt_y_single.to(device) - - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - y = roi_align(x, single_roi) - - self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CUDA') - - y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), single_roi) - self.assertTrue(torch.allclose(gt_y_single, y), 'RoIAlign layer incorrect for single ROI on CUDA') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_align_cuda(self): - device = torch.device('cuda') - x = self.x.to(device) - rois = self.rois.to(device) - gt_y_multiple = self.gt_y_multiple.to(device) - - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - y = roi_align(x, rois) - - 
self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CUDA') - - y = roi_align(x.transpose(2, 3).contiguous().transpose(2, 3), rois) - self.assertTrue(torch.allclose(gt_y_multiple, y), 'RoIAlign layer incorrect for multiple ROIs on CUDA') - - def test_roi_align_gradient_cpu(self): - """ - Compute gradients for RoIAlign with multiple bounding boxes on CPU - """ - device = torch.device('cpu') - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - - x = self.x.to(device).clone() - rois = self.rois.to(device) - gt_grad = self.x_grad.to(device) - - x.requires_grad = True - y = roi_align(x, rois) - s = y.sum() - s.backward() - - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for RoIAlign CPU') - - def test_roi_align_gradcheck_cpu(self): - dtype = torch.float64 - device = torch.device('cpu') - m = ops.RoIAlign((5, 5), 0.5, 1).to(dtype=dtype, device=device) - x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True) - rois = self.rois.to(device=device, dtype=dtype) - - def func(input): - return m(input, rois) - - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for RoIAlign CPU') - self.assertTrue(gradcheck(func, (x.transpose(2, 3),)), 'gradcheck failed for RoIAlign CPU') - - @torch.jit.script - def script_func(input, rois): - return ops.roi_align(input, rois, 5, 0.5, 1)[0] - - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), 'gradcheck failed for scripted roi_align') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_align_gradient_cuda(self): - """ - Compute gradients for RoIAlign with multiple bounding boxes on the GPU - """ - device = torch.device('cuda') - pool_h, pool_w = (5, 5) - roi_align = ops.RoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(device=device) - - x = self.x.to(device).clone() - rois = self.rois.to(device) - gt_grad = self.x_grad.to(device) - - x.requires_grad = True - y = roi_align(x, rois) - s = y.sum() - s.backward() - - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for RoIAlign CUDA') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_roi_align_gradcheck_cuda(self): - dtype = torch.float64 - device = torch.device('cuda') - m = ops.RoIAlign((5, 5), 0.5, 1).to(dtype=dtype, device=device) - x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True) - rois = self.rois.to(device=device, dtype=dtype) - - def func(input): - return m(input, rois) - - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for RoIAlign CUDA') - self.assertTrue(gradcheck(func, (x.transpose(2, 3),)), 'gradcheck failed for RoIAlign CUDA') - - @torch.jit.script - def script_func(input, rois): - return ops.roi_align(input, rois, 5, 0.5, 1)[0] - - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), - 'gradcheck failed for scripted roi_align on CUDA') - - -def bilinear_interpolate(data, height, width, y, x): - if y < -1.0 or y > height or x < -1.0 or x > width: - return 0. - - if y <= 0: - y = 0. - if x <= 0: - x = 0. - - y_low, x_low = int(y), int(x) - y_high, x_high = 0, 0 - - if y_low >= height - 1: - y_high = y_low = height - 1 - y = float(y_low) - else: - y_high = y_low + 1 - - if x_low >= width - 1: - x_high = x_low = width - 1 - x = float(x_low) - else: - x_high = x_low + 1 - - ly = y - y_low - lx = x - x_low - hy, hx = 1. - ly, 1. 
- lx - v1 = data[y_low * width + x_low] - v2 = data[y_low * width + x_high] - v3 = data[y_high * width + x_low] - v4 = data[y_high * width + x_high] - w1, w2, w3, w4 = hy * hx, hy * lx, ly * hx, ly * lx - - return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - - -class PSRoIAlignTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.dtype = torch.float64 - - def slow_ps_roi_align(self, in_data, rois, pool_h, pool_w, device, spatial_scale=1, - sampling_ratio=-1, dtype=torch.float64): - if device is None: - device = torch.device("cpu") - num_input_channels = in_data.size(1) - self.assertEqual(num_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw") - num_output_channels = int(num_input_channels / (pool_h * pool_w)) - out_data = torch.zeros(rois.size(0), num_output_channels, pool_h, pool_w, dtype=dtype, device=device) - - for n in range(0, in_data.size(0)): - for r, roi in enumerate(rois): - if roi[0] != n: - continue - roi[1:] = (roi[1:] * spatial_scale) - 0.5 - c_in = 0 - roi_height = float(roi[4].item() - roi[2].item()) - roi_width = float(roi[3].item() - roi[1].item()) - bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w) - for c_out in range(0, num_output_channels): - for j in range(0, pool_h): - start_h = float(j) * bin_h + roi[2].item() - - for i in range(0, pool_w): - start_w = float(i) * bin_w + roi[1].item() - - roi_bin_grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(roi_height / pool_h)) - roi_bin_grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(roi_width / pool_w)) - - val = 0. - for iy in range(0, roi_bin_grid_h): - y = start_h + (iy + 0.5) * bin_h / float(roi_bin_grid_h) - for ix in range(0, roi_bin_grid_w): - x = start_w + (ix + 0.5) * bin_w / float(roi_bin_grid_w) - val += bilinear_interpolate( - in_data[n, c_in, :, :].flatten(), - in_data.size(-2), - in_data.size(-1), - y, x - ) - count = roi_bin_grid_h * roi_bin_grid_w - out_data[r, c_out, j, i] = val / count - c_in += 1 - return out_data - - def test_ps_roi_align_basic_cpu(self): - device = torch.device('cpu') - pool_size = 3 - x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 5, 5]], # format is (xyxy) - dtype=self.dtype, device=device) - - pool_h, pool_w = (pool_size, pool_size) - ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2) - y = ps_roi_align(x, rois) - - gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU') - - y = ps_roi_align(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=-1, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU') + def test_backward_cuda_non_contiguous(self): + self._test_backward(device=torch.device('cuda'), contiguous=False) - def test_ps_roi_align_cpu(self): - device = torch.device('cpu') + def _test_forward(self, device, contiguous): pool_size = 5 - x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device) + # n_channels % (pool_size ** 2) == 0 required for PS operations. 
+ n_channels = 2 * (pool_size ** 2) + x = torch.rand(2, n_channels, 10, 10, dtype=self.dtype, device=device) + if not contiguous: + x = x.permute(0, 1, 3, 2) rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], dtype=self.dtype, device=device) - pool_h, pool_w = (pool_size, pool_size) - ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2) - y = ps_roi_align(x, rois) - - gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU') - - y = ps_roi_align(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, - device, spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIAlign layer incorrect on CPU') - - def test_ps_roi_align_gradient_cpu(self): - device = torch.device('cpu') - pool_size = 3 - layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1, - sampling_ratio=-1).to(dtype=self.dtype, device=device) - x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 4, 4], - [0, 0, 3, 5, 5], - [0, 1, 0, 2, 4]], - dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ], - [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ], - [1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ], - [5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ], - [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ], - [0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ], - [0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ], - [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ], - [0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ], - [0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ], - [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ], - [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ], - [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ], - [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ], - [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ], - [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ], - [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ], - [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ], - [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ], - [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ], - [7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ], - [7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ], - [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ], - [7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ], - [9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - 
[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ], - [0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ], - [0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]], - device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIAlign on CPU') - - def test_ps_roi_align_gradcheck_cpu(self): - device = torch.device('cpu') - pool_size = 5 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) - - m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1, - sampling_ratio=2).to(dtype=self.dtype, device=device) - - def func(input): - return m(input, rois) - - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIAlign on CPU') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIAlign on CPU') - - @torch.jit.script - def script_func(input, rois): - return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0] - - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), - 'gradcheck failed for scripted ps_roi_align on CPU') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_align_basic_cuda(self): - device = torch.device('cuda') - pool_size = 3 - x = torch.rand(1, 2 * (pool_size ** 2), 7, 7, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 5, 5]], # format is (xyxy) - dtype=self.dtype, device=device) + pool_h, pool_w = pool_size, pool_size + y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1) + gt_y = self.expected_fn(x, rois, pool_h, pool_w, spatial_scale=1, + sampling_ratio=-1, device=device, dtype=self.dtype) - pool_h, pool_w = (pool_size, pool_size) - ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2) - y = ps_roi_align(x, rois) + self.assertTrue(torch.allclose(gt_y, y)) - gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect') - - y = ps_roi_align(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=-1, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect') - - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_align_cuda(self): - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + def _test_backward(self, device, contiguous): pool_size = 5 - x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device) + x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device, requires_grad=True) + if not contiguous: + x = x.permute(0, 1, 3, 2) rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], dtype=self.dtype, device=device) - pool_h, pool_w = (pool_size, pool_size) - ps_roi_align = ops.PSRoIAlign((pool_h, pool_w), spatial_scale=1, sampling_ratio=2) - y = ps_roi_align(x, rois) + def func(z): + return self.fn(z, rois, 5, 5, spatial_scale=1, sampling_ratio=1) - gt_y = self.slow_ps_roi_align(x, rois, pool_h, pool_w, device, - spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - 
self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect') + script_func = self.get_script_fn(rois) - y = ps_roi_align(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_align(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, - device, spatial_scale=1, sampling_ratio=2, - dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIAlign layer incorrect') + self.assertTrue(gradcheck(func, (x,))) + self.assertTrue(gradcheck(script_func, (x,))) - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_align_gradient_cuda(self): - device = torch.device('cuda') - pool_size = 3 - layer = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1, - sampling_ratio=-1).to(dtype=self.dtype, device=device) - x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 4, 4], - [0, 0, 3, 5, 5], - [0, 1, 0, 2, 4]], - dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - gt_grad = torch.tensor([[[[8.125e-01, 6.875e-01, 0.0, 0.0, 0.0, ], - [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ], - [1.0416666667e-01, 6.25e-02, 0.0, 0.0, 0.0, ], - [5.2083333333e-01, 3.125e-01, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[8.3266726847e-17, 1.125e00, 3.750e-01, 0.0, 0.0, ], - [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ], - [0.0, 3.4722222222e-02, 9.7222222222e-02, 3.4722222222e-02, 0.0, ], - [0.0, 1.7361111111e-01, 4.8611111111e-01, 1.7361111111e-01, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[0.0, 5.000e-01, 4.375e-01, 5.000e-01, 6.25e-02, ], - [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ], - [0.0, 0.0, 0.0, 6.25e-02, 1.0416666667e-01, ], - [0.0, 0.0, 0.0, 3.125e-01, 5.2083333333e-01, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ], - [5.4166666667e-01, 4.5833333333e-01, 0.0, 0.0, 0.0, ], - [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ], - [3.125e-01, 1.875e-01, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ], - [5.5511151231e-17, 7.500e-01, 2.500e-01, 0.0, 0.0, ], - [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ], - [0.0, 1.0416666667e-01, 2.9166666667e-01, 1.0416666667e-01, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ], - [0.0, 3.3333333333e-01, 2.9166666667e-01, 3.3333333333e-01, 4.1666666667e-02, ], - [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ], - [0.0, 0.0, 0.0, 1.875e-01, 3.125e-01, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ], - [2.7083333333e-01, 2.2916666667e-01, 0.0, 0.0, 0.0, ], - [7.2222222222e-01, 6.1111111111e-01, 0.0, 0.0, 0.0, ], - [7.1527777778e-01, 4.5138888889e-01, 0.0, 0.0, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ], - [2.7755575616e-17, 3.750e-01, 1.250e-01, 0.0, 0.0, ], - [7.4014868308e-17, 1.000e00, 3.3333333333e-01, 0.0, 0.0, ], - [9.2518585385e-18, 3.3333333333e-01, 6.25e-01, 2.0833333333e-01, 0.0, ]], - [[0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 0.0, 0.0, 0.0, 0.0, ], - [0.0, 1.6666666667e-01, 1.4583333333e-01, 1.6666666667e-01, 2.0833333333e-02, ], - [0.0, 4.4444444444e-01, 3.8888888889e-01, 4.4444444444e-01, 5.5555555556e-02, ], - [0.0, 5.5555555556e-02, 4.8611111111e-02, 4.3055555556e-01, 6.3194444444e-01, ]]]], - device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect 
for PSRoIAlign') + def fn(*args, **kwargs): + pass - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_align_gradcheck_cuda(self): - device = torch.device('cuda') - pool_size = 5 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) + def get_script_fn(*args, **kwargs): + pass - m = ops.PSRoIAlign((pool_size, pool_size), spatial_scale=1, - sampling_ratio=2).to(dtype=self.dtype, device=device) + def expected_fn(*args, **kwargs): + pass - def func(input): - return m(input, rois) - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIAlign CUDA') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIAlign CUDA') +class RoIPoolTester(RoIOpTester, unittest.TestCase): + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): + return ops.RoIPool((pool_h, pool_w), spatial_scale)(x, rois) + def get_script_fn(self, rois): @torch.jit.script - def script_func(input, rois): - return ops.ps_roi_align(input, rois, 5, 2.0, 1)[0] - - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), - 'gradcheck failed for scripted ps_roi_align on CUDA') - - -class PSRoIPoolTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.dtype = torch.float64 + def script_fn(input, rois): + return ops.roi_pool(input, rois, 5, 1.0)[0] + return lambda x: script_fn(x, rois) - def slow_ps_roi_pooling(self, x, rois, pool_h, pool_w, device, spatial_scale=1, - dtype=torch.float64): + def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, + device=None, dtype=torch.float64): if device is None: device = torch.device("cpu") - num_input_channels = x.size(1) - self.assertEqual(num_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw") - num_output_channels = int(num_input_channels / (pool_h * pool_w)) - y = torch.zeros(rois.size(0), num_output_channels, pool_h, pool_w, dtype=dtype, device=device) - - rois = torch.round(rois * spatial_scale).int() - for n in range(0, x.size(0)): - for r, roi in enumerate(rois): - if roi[0] != n: - continue - c_in = 0 - for c_out in range(0, num_output_channels): - roi_height = max(roi[4].item() - roi[2].item(), 1) - roi_width = max(roi[3].item() - roi[1].item(), 1) - bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w) - - for j in range(0, pool_h): - start_h = int(np.floor(j * bin_h)) + roi[2].item() - end_h = int(np.ceil((j + 1) * bin_w)) + roi[2].item() - - # range-check - start_h = min(max(start_h, 0), x.size(2)) - end_h = min(max(end_h, 0), x.size(2)) - - for i in range(0, pool_w): - start_w = int(np.floor(i * bin_w)) + roi[1].item() - end_w = int(np.ceil((i + 1) * bin_w)) + roi[1].item() - - # range-check - start_w = min(max(start_w, 0), x.size(3)) - end_w = min(max(end_w, 0), x.size(3)) - - is_empty = (end_h <= start_h) or (end_w <= start_w) - area = (end_h - start_h) * (end_w - start_w) - - if not is_empty: - t = torch.sum(x[n, c_in, slice(start_h, end_h), slice(start_w, end_w)]) - y[r, c_out, j, i] = t / area - c_in += 1 - return y - - def test_ps_roi_pool_basic_cpu(self): - device = torch.device('cpu') - pool_size = 3 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy) - dtype=self.dtype, device=device) - pool_h, pool_w = (pool_size, pool_size) - 
ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1) - y = ps_roi_pool(x, rois) + n_channels = x.size(1) + y = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device) - gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU') + def get_slice(k, block): + return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block))) - y = ps_roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU') - - def test_ps_roi_pool_cpu(self): - device = torch.device('cpu') - pool_size = 5 - x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9], - [1, 0, 0, 9, 9]], - dtype=self.dtype, device=device) + for roi_idx, roi in enumerate(rois): + batch_idx = int(roi[0]) + j_begin, i_begin, j_end, i_end = (int(round(x.item() * spatial_scale)) for x in roi[1:]) + roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1] - pool_h, pool_w = (pool_size, pool_size) - ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1) - y = ps_roi_pool(x, rois) - - gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU') - - y = ps_roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y, y), 'PSRoIPool layer incorrect on CPU') - - def test_ps_roi_pool_gradient_cpu(self): - device = torch.device('cpu') - pool_size = 3 - layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device) - x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 4, 4], - [0, 0, 3, 5, 5], - [0, 1, 0, 2, 4]], - dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.5000, 0.5000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.5000, 0.5000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.2500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.2500, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000], - [0.0000, 1. / 6, 1. / 6, 1. 
/ 6, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.2500, 0.2500], - [0.0000, 0.0000, 0.0000, 0.2500, 0.2500]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]], - device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool on CPU') - - def test_ps_roi_pool_gradcheck_cpu(self): - device = torch.device('cpu') - pool_size = 5 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) + roi_h, roi_w = roi_x.shape[-2:] + bin_h = roi_h / pool_h + bin_w = roi_w / pool_w - m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device) + for i in range(0, pool_h): + for j in range(0, pool_w): + bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)] + if bin_x.numel() > 0: + y[roi_idx, :, i, j] = bin_x.reshape(n_channels, -1).max(dim=1)[0] + return y - def func(input): - return m(input, rois) - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIPool on CPU') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIPool on CPU') +class PSRoIPoolTester(RoIOpTester, unittest.TestCase): + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): + return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) + def get_script_fn(self, rois): @torch.jit.script - def script_func(input, rois): + def script_fn(input, rois): return ops.ps_roi_pool(input, rois, 5, 1.0)[0] + return lambda x: script_fn(x, rois) - self.assertTrue(gradcheck(lambda x: script_func(x, rois), (x,)), - 'gradcheck failed for scripted ps_roi_pool on CPU') + def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, + device=None, dtype=torch.float64): + if device is None: + device = torch.device("cpu") + n_input_channels = x.size(1) + self.assertEqual(n_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw") + n_output_channels = int(n_input_channels / (pool_h * pool_w)) + y = torch.zeros(rois.size(0), n_output_channels, pool_h, pool_w, dtype=dtype, device=device) + + def get_slice(k, block): + return slice(int(np.floor(k * block)), int(np.ceil((k + 1) * block))) + + for roi_idx, roi in enumerate(rois): + batch_idx = int(roi[0]) + j_begin, i_begin, j_end, i_end = (int(round(x.item() * spatial_scale)) for x in roi[1:]) + roi_x = x[batch_idx, :, i_begin:i_end + 1, j_begin:j_end + 1] + + roi_height = max(i_end - i_begin, 1) + roi_width = max(j_end - j_begin, 1) + bin_h, bin_w = roi_height / float(pool_h), roi_width / float(pool_w) + + for i in range(0, pool_h): + for j in range(0, pool_w): + bin_x = roi_x[:, get_slice(i, bin_h), get_slice(j, bin_w)] + if 
bin_x.numel() > 0: + area = bin_x.size(-2) * bin_x.size(-1) + for c_out in range(0, n_output_channels): + c_in = c_out * (pool_h * pool_w) + pool_w * i + j + t = torch.sum(bin_x[c_in, :, :]) + y[roi_idx, c_out, i, j] = t / area + return y - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_pool_basic_cuda(self): - device = torch.device('cuda') - pool_size = 3 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 4, 4]], # format is (xyxy) - dtype=self.dtype, device=device) - pool_h, pool_w = (pool_size, pool_size) - ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1) - y = ps_roi_pool(x, rois) +def bilinear_interpolate(data, height, width, y, x): + if y < -1.0 or y > height or x < -1.0 or x > width: + return 0. - gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect') + y = min(max(0, y), height - 1) + x = min(max(0, x), width - 1) - y = ps_roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect') + y_low = int(y) + y_high = min(y_low + 1, height - 1) - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_pool_cuda(self): - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - pool_size = 5 - x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device) - rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9], - [1, 0, 0, 9, 9]], - dtype=self.dtype, device=device) + x_low = int(x) + x_high = min(x_low + 1, width - 1) - pool_h, pool_w = (pool_size, pool_size) - ps_roi_pool = ops.PSRoIPool((pool_h, pool_w), 1) - y = ps_roi_pool(x, rois) + wy_h = y - y_low + wy_l = 1 - wy_h - gt_y = self.slow_ps_roi_pooling(x, rois, pool_h, pool_w, device, dtype=self.dtype) + wx_h = x - x_low + wx_l = 1 - wx_h - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect') + val = 0 + for wx, x in zip((wx_l, wx_h), (x_low, x_high)): + for wy, y in zip((wy_l, wy_h), (y_low, y_high)): + val += wx * wy * data[y * width + x] + return val - y = ps_roi_pool(x.permute(0, 1, 3, 2), rois) - gt_y = self.slow_ps_roi_pooling(x.permute(0, 1, 3, 2), rois, pool_h, pool_w, device, dtype=self.dtype) - self.assertTrue(torch.allclose(gt_y.cuda(), y), 'PSRoIPool layer incorrect') - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_pool_gradient_cuda(self): - device = torch.device('cuda') - pool_size = 3 - layer = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device) - x = torch.ones(1, pool_size ** 2, 5, 5, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 4, 4], - [0, 0, 3, 5, 5], - [0, 1, 0, 2, 4]], - dtype=self.dtype, device=device) - - y = layer(x, rois) - s = y.sum() - s.backward() - gt_grad = torch.tensor([[[[0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.5000, 0.5000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 1. / 3, 1. / 3, 1. 
/ 3, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.5000, 0.5000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.2500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.2500, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000], - [0.0000, 1. / 6, 1. / 6, 1. / 6, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.2500, 0.2500], - [0.0000, 0.0000, 0.0000, 0.2500, 0.2500]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.2500, 0.7500, 0.0000, 0.0000, 0.0000], - [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 0.7500, 0.2500, 0.0000, 0.0000], - [0.0000, 1. / 3, 1. / 3, 1. / 3, 0.0000]], - - [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.5000, 0.2500, 0.2500, 0.0000], - [0.0000, 0.0000, 0.0000, 0.5000, 0.5000]]]], - device=device, dtype=self.dtype) - self.assertTrue(torch.allclose(x.grad, gt_grad), 'gradient incorrect for PSRoIPool') +class RoIAlignTester(RoIOpTester, unittest.TestCase): + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): + return ops.RoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, + sampling_ratio=sampling_ratio)(x, rois) - @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") - def test_ps_roi_pool_gradcheck_cuda(self): - device = torch.device('cuda') - pool_size = 5 - x = torch.rand(1, pool_size ** 2, 10, 10, dtype=self.dtype, device=device, requires_grad=True) - rois = torch.tensor([ - [0, 0, 0, 9, 9], - [0, 0, 5, 5, 9], - [0, 5, 5, 9, 9]], dtype=self.dtype, device=device) + def get_script_fn(self, rois): + @torch.jit.script + def script_func(input, rois): + return ops.roi_align(input, rois, 5, 1.0)[0] + return lambda z: script_func(z, rois) - m = ops.PSRoIPool((pool_size, pool_size), 1).to(dtype=self.dtype, device=device) + def expected_fn(self, in_data, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, + device=None, dtype=torch.float64): + if device is None: + device = torch.device("cpu") + n_channels = in_data.size(1) + out_data = torch.zeros(rois.size(0), n_channels, pool_h, pool_w, dtype=dtype, device=device) + + for r, roi in enumerate(rois): + batch_idx = int(roi[0]) + j_begin, i_begin, j_end, i_end = (x.item() * spatial_scale for x in roi[1:]) + + roi_h = i_end - i_begin + roi_w = j_end - j_begin + bin_h = roi_h / pool_h + bin_w = roi_w / pool_w + + for i in range(0, pool_h): + start_h = i_begin + i * bin_h + grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_h)) + for j in range(0, pool_w): + start_w = j_begin + j * bin_w + grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) + + for channel in range(0, n_channels): + + val = 0 + for iy in range(0, grid_h): + y = start_h + 
(iy + 0.5) * bin_h / grid_h + for ix in range(0, grid_w): + x = start_w + (ix + 0.5) * bin_w / grid_w + val += bilinear_interpolate( + in_data[batch_idx, channel, :, :].flatten(), + in_data.size(-2), + in_data.size(-1), + y, x + ) + val /= grid_h * grid_w + + out_data[r, channel, i, j] = val + return out_data - def func(input): - return m(input, rois) - self.assertTrue(gradcheck(func, (x,)), 'gradcheck failed for PSRoIPool CUDA') - self.assertTrue(gradcheck(func, (x.permute(0, 1, 3, 2),)), 'gradcheck failed for PSRoIPool CUDA') +class PSRoIAlignTester(RoIOpTester, unittest.TestCase): + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): + return ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, + sampling_ratio=sampling_ratio)(x, rois) + def get_script_fn(self, rois): @torch.jit.script def script_func(input, rois): - return ops.ps_roi_pool(input, rois, 5, 1.0)[0] + return ops.ps_roi_align(input, rois, 5, 1.0)[0] + return lambda z: script_func(z, rois) + def expected_fn(self, in_data, rois, pool_h, pool_w, device, spatial_scale=1, + sampling_ratio=-1, dtype=torch.float64): + if device is None: + device = torch.device("cpu") + n_input_channels = in_data.size(1) + self.assertEqual(n_input_channels % (pool_h * pool_w), 0, "input channels must be divisible by ph * pw") + n_output_channels = int(n_input_channels / (pool_h * pool_w)) + out_data = torch.zeros(rois.size(0), n_output_channels, pool_h, pool_w, dtype=dtype, device=device) + + for r, roi in enumerate(rois): + batch_idx = int(roi[0]) + j_begin, i_begin, j_end, i_end = (x.item() * spatial_scale - 0.5 for x in roi[1:]) + + roi_h = i_end - i_begin + roi_w = j_end - j_begin + bin_h = roi_h / pool_h + bin_w = roi_w / pool_w + + for i in range(0, pool_h): + start_h = i_begin + i * bin_h + grid_h = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_h)) + for j in range(0, pool_w): + start_w = j_begin + j * bin_w + grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) + for c_out in range(0, n_output_channels): + c_in = c_out * (pool_h * pool_w) + pool_w * i + j + + val = 0 + for iy in range(0, grid_h): + y = start_h + (iy + 0.5) * bin_h / grid_h + for ix in range(0, grid_w): + x = start_w + (ix + 0.5) * bin_w / grid_w + val += bilinear_interpolate( + in_data[batch_idx, c_in, :, :].flatten(), + in_data.size(-2), + in_data.size(-1), + y, x + ) + val /= grid_h * grid_w + + out_data[r, c_out, i, j] = val + return out_data class NMSTester(unittest.TestCase): From 6439095aff594ff31f00e339a54a80d76de23003 Mon Sep 17 00:00:00 2001 From: Pedro Freire Date: Mon, 4 Nov 2019 17:13:29 +0100 Subject: [PATCH 2/3] Reduce input size used in gradcheck. gradcheck can be quite costly, and it was causing OOM errors and making the tests slow. By reducing the size of the input, the CPU test time is down to 3 seconds. Other points: - We remove an unused namedtuple; - We inherit from object for better Python 2 compatibility; - We remove a hardcoded pool_size from the TorchScript functions, and add it as a parameter instead. 
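For example, the RoIPool script helper now takes the pool size explicitly; the other ops follow the same pattern (see the diff below):

    def get_script_fn(self, rois, pool_size):
        @torch.jit.script
        def script_fn(input, rois, pool_size):
            # type: (Tensor, Tensor, int) -> Tensor
            return ops.roi_pool(input, rois, pool_size, 1.0)[0]
        return lambda x: script_fn(x, rois, pool_size)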
--- test/test_ops.py | 57 ++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 3eb9e9d78f6..f375daf16bc 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -8,11 +8,8 @@ from itertools import product import unittest -from collections import namedtuple -Example = namedtuple('Example', ['x', 'rois']) - -class RoIOpTester: +class RoIOpTester(object): @classmethod def setUpClass(cls): cls.dtype = torch.float64 @@ -66,23 +63,23 @@ def _test_forward(self, device, contiguous): self.assertTrue(torch.allclose(gt_y, y)) def _test_backward(self, device, contiguous): - pool_size = 5 - x = torch.rand(2, 2 * (pool_size ** 2), 10, 10, dtype=self.dtype, device=device, requires_grad=True) + pool_size = 2 + x = torch.rand(1, 2 * (pool_size ** 2), 5, 5, dtype=self.dtype, device=device, requires_grad=True) if not contiguous: x = x.permute(0, 1, 3, 2) - rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) - [0, 0, 5, 4, 9], - [0, 5, 5, 9, 9], - [1, 0, 0, 9, 9]], + rois = torch.tensor([[0, 0, 0, 4, 4], # format is (xyxy) + [0, 0, 2, 3, 4], + [0, 2, 2, 4, 4]], dtype=self.dtype, device=device) def func(z): - return self.fn(z, rois, 5, 5, spatial_scale=1, sampling_ratio=1) + return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1) - script_func = self.get_script_fn(rois) + script_func = self.get_script_fn(rois, pool_size) self.assertTrue(gradcheck(func, (x,))) self.assertTrue(gradcheck(script_func, (x,))) + return def fn(*args, **kwargs): pass @@ -98,11 +95,12 @@ class RoIPoolTester(RoIOpTester, unittest.TestCase): def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.RoIPool((pool_h, pool_w), spatial_scale)(x, rois) - def get_script_fn(self, rois): + def get_script_fn(self, rois, pool_size): @torch.jit.script - def script_fn(input, rois): - return ops.roi_pool(input, rois, 5, 1.0)[0] - return lambda x: script_fn(x, rois) + def script_fn(input, rois, pool_size): + # type: (Tensor, Tensor, int) -> Tensor + return ops.roi_pool(input, rois, pool_size, 1.0)[0] + return lambda x: script_fn(x, rois, pool_size) def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=None, dtype=torch.float64): @@ -136,11 +134,12 @@ class PSRoIPoolTester(RoIOpTester, unittest.TestCase): def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) - def get_script_fn(self, rois): + def get_script_fn(self, rois, pool_size): @torch.jit.script - def script_fn(input, rois): - return ops.ps_roi_pool(input, rois, 5, 1.0)[0] - return lambda x: script_fn(x, rois) + def script_fn(input, rois, pool_size): + # type: (Tensor, Tensor, int) -> Tensor + return ops.ps_roi_pool(input, rois, pool_size, 1.0)[0] + return lambda x: script_fn(x, rois, pool_size) def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=None, dtype=torch.float64): @@ -206,11 +205,12 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar return ops.RoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) - def get_script_fn(self, rois): + def get_script_fn(self, rois, pool_size): @torch.jit.script - def script_func(input, rois): - return ops.roi_align(input, rois, 5, 1.0)[0] - return lambda z: script_func(z, rois) + def script_fn(input, rois, pool_size): + # type: (Tensor, Tensor, int) -> Tensor + 
return ops.roi_align(input, rois, pool_size, 1.0)[0] + return lambda x: script_fn(x, rois, pool_size) def expected_fn(self, in_data, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=None, dtype=torch.float64): @@ -259,11 +259,12 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar return ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) - def get_script_fn(self, rois): + def get_script_fn(self, rois, pool_size): @torch.jit.script - def script_func(input, rois): - return ops.ps_roi_align(input, rois, 5, 1.0)[0] - return lambda z: script_func(z, rois) + def script_fn(input, rois, pool_size): + # type: (Tensor, Tensor, int) -> Tensor + return ops.ps_roi_align(input, rois, pool_size, 1.0)[0] + return lambda x: script_fn(x, rois, pool_size) def expected_fn(self, in_data, rois, pool_h, pool_w, device, spatial_scale=1, sampling_ratio=-1, dtype=torch.float64): From 765b8912d8952043b874675d3b3de5387975adcb Mon Sep 17 00:00:00 2001 From: Pedro Freire Date: Mon, 4 Nov 2019 18:53:01 +0100 Subject: [PATCH 3/3] Replace Tensor by torch.Tensor in type annotations. This should fix lint errors. --- test/test_ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index f375daf16bc..c4cc3fe0bd6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -98,7 +98,7 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar def get_script_fn(self, rois, pool_size): @torch.jit.script def script_fn(input, rois, pool_size): - # type: (Tensor, Tensor, int) -> Tensor + # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor return ops.roi_pool(input, rois, pool_size, 1.0)[0] return lambda x: script_fn(x, rois, pool_size) @@ -137,7 +137,7 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar def get_script_fn(self, rois, pool_size): @torch.jit.script def script_fn(input, rois, pool_size): - # type: (Tensor, Tensor, int) -> Tensor + # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor return ops.ps_roi_pool(input, rois, pool_size, 1.0)[0] return lambda x: script_fn(x, rois, pool_size) @@ -208,7 +208,7 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar def get_script_fn(self, rois, pool_size): @torch.jit.script def script_fn(input, rois, pool_size): - # type: (Tensor, Tensor, int) -> Tensor + # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor return ops.roi_align(input, rois, pool_size, 1.0)[0] return lambda x: script_fn(x, rois, pool_size) @@ -262,7 +262,7 @@ def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwar def get_script_fn(self, rois, pool_size): @torch.jit.script def script_fn(input, rois, pool_size): - # type: (Tensor, Tensor, int) -> Tensor + # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor return ops.ps_roi_align(input, rois, pool_size, 1.0)[0] return lambda x: script_fn(x, rois, pool_size)
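Usage sketch (illustrative only, not part of the patch series): with this refactor in place, adding tests for a new RoI op amounts to subclassing RoIOpTester and implementing the three hooks. The names ops.my_roi_op and MyRoIOpTester below are hypothetical placeholders, not real torchvision APIs:

    class MyRoIOpTester(RoIOpTester, unittest.TestCase):
        def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
            # Forward pass of the (hypothetical) op under test.
            return ops.my_roi_op(x, rois, (pool_h, pool_w), spatial_scale)

        def get_script_fn(self, rois, pool_size):
            @torch.jit.script
            def script_fn(input, rois, pool_size):
                # type: (torch.Tensor, torch.Tensor, int) -> torch.Tensor
                return ops.my_roi_op(input, rois, pool_size, 1.0)[0]
            return lambda x: script_fn(x, rois, pool_size)

        def expected_fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1,
                        device=None, dtype=torch.float64):
            # Slow, pure-PyTorch reference implementation to compare against.
            ...

The eight inherited test_forward_*/test_backward_* methods then exercise the op on cpu/cuda with contiguous/non-contiguous inputs, with no further code.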