pytorch · eellison · Aug 29, 2022 · Aug 24, 2022 · Aug 25, 2022 · Aug 25, 2022
diff --git a/test/test_torchinductor.py b/test/test_torchinductor.py
@@ -1733,6 +1733,15 @@ def fn(x):
             (torch.randn([64]),),
         )
 
+    def test_flip(self):  # exercises torch.flip with one negative dim and with two dims at once
+        def fn(x):  # returns (x flipped along its last dim, x flipped along dims 0 and 2, minus 2)
+            return torch.flip(x, (-1,)), torch.flip(x, (0, 2)) - 2
+
+        self.common(  # NOTE(review): helper defined elsewhere; presumably compares against eager
+            fn,
+            (torch.randn([1, 2, 6, 6]),),  # dim 0 has size 1, so flipping it is a no-op here
+        )
+
     def test_log2(self):
         def fn(x):
             return torch.log2(x), torch.log2(x + 1) - 2

diff --git a/torchinductor/codegen/cpp.py b/torchinductor/codegen/cpp.py
@@ -190,6 +190,10 @@ def exp(x):
     def sqrt(x):
         return f"std::sqrt({x})"
 
+    @staticmethod
+    def rsqrt(x):  # builds C++ source text for the reciprocal square root of expression x
+        return f"1 / std::sqrt({x})"  # NOTE(review): no outer parens; confirm it is never embedded in a larger expr
+
     @staticmethod
     def pow(a, b):
         return f"std::pow({a}, {b})"

diff --git a/torchinductor/codegen/triton.py b/torchinductor/codegen/triton.py
@@ -158,6 +158,10 @@ def rand(seed, offset, _):  # _ here to keep the contract identical to CPU rand
     def randn(seed, offset, _):  # _ here to keep the contract identical to CPU randn op
         return f"tl.randn({seed}, {offset})"
 
+    @staticmethod
+    def rsqrt(x):  # builds Triton source text that calls tl.libdevice.rsqrt on expression x
+        return f"tl.libdevice.rsqrt({x})"
+
     @staticmethod
     def pow(a, b):
         return f"tl.libdevice.pow({a}, {b})"

diff --git a/torchinductor/decomposition.py b/torchinductor/decomposition.py
@@ -83,6 +83,7 @@
         aten.tanh_backward,
         aten.threshold_backward,
         aten.transpose.int,
+        aten.tril.default,
         aten.upsample_nearest2d_backward,
         aten.upsample_bilinear2d.vec,
     ]

diff --git a/torchinductor/lowering.py b/torchinductor/lowering.py
@@ -1907,6 +1907,28 @@ def accumulate(out_x, out_y, index_range1, index_range2=None):
     )
 
 
+@register_lowering(prims.rev.default)
+def rev(x, dims):  # lowering for prims.rev: reverses x along every dimension listed in dims
+    # note - dims pre-canonicalized (presumably non-negative and in range; confirm with caller)
+    x_loader = x.make_loader()
+    sizes = x.get_size()
+
+    def loader(idx):  # maps an output index to the mirrored input index and loads from x
+        idx = list(idx)  # mutable copy so individual entries can be reassigned
+        assert len(idx) == len(sizes)
+        for dim in dims:
+            idx[dim] = (sizes[dim] - 1) - idx[dim]  # position i reads from position size - 1 - i
+
+        return x_loader(idx)
+
+    return Pointwise.create(  # result uses x's device and dtype, with x's sizes as its ranges
+        device=x.get_device(),
+        dtype=x.get_dtype(),
+        inner_fn=loader,
+        ranges=sizes,
+    )
+
+
 @register_lowering(aten.constant_pad_nd, type_promote=False)
 def constant_pad_nd(x, padding, fill_value=0):
     assert (len(padding) % 2) == 0
@@ -2829,6 +2851,20 @@ def fn(*args):
     )
 
 
+# TODO: enable this builtin lowering (and disable the rsqrt decomp) so rsqrt lowers to a PTX instruction.
+# Disabled for now: enabling it causes compilation to not complete on timm_vision_transformers inference.
+# @register_lowering(aten.rsqrt)
+# def rsqrt(x):
+#     dtype = x.get_dtype()
+#     if is_integer_dtype(dtype) or is_boolean_dtype(dtype):
+#         x = to_dtype(x, torch.get_default_dtype())
+#
+#     def _rsqrt(x):
+#         return ops.rsqrt(x)
+#
+#     return make_pointwise(_rsqrt)(x)
+
+
 @register_lowering([aten.sum, prims.sum])
 def sum_(x, axis=None, keepdims=False, *, dtype=None):
     if (