
Commit 81f351a

peterbell10 authored and pytorchmergebot committed
[inductor] Prevent blowup in inner_fn_str and extract_read_writes (pytorch#88933)
Currently the default `ops` handler expects strings as arguments and just formats them into a function-call template string. For complex expressions, this can lead to exponential growth in the number of terms. Say, for example, you have:

```python
def fn(a):
    for _ in range(3):
        a = ops.mul(a, a)
    return a
```

You might expect `inner_fn_str` to contain 1 load and 3 multiplies, but instead you find 8 loads and 7 multiplies:

```python
load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0) * load(arg_0, i0)
```

This type of blowup is present in the lowering for `max_pool2d_with_indices_backward`, which in pytorch/torchdynamo#1352 was reported to have caused the entire compilation to hang.

This PR fixes the issue by formatting the string as a series of assignments to variables, so for the example above we now get:

```
tmp0 = load(arg_0, i0)
tmp1 = tmp0 * tmp0
tmp2 = tmp1 * tmp1
tmp3 = tmp2 * tmp2
return tmp3
```

which corresponds to the sequence of `ops` calls made.

Pull Request resolved: pytorch#88933
Approved by: https://github.com/jansel
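To make the mechanism concrete, here is a minimal, self-contained sketch of the idea, not the actual `V.KernelFormatterHandler` from Inductor: `NaiveStringHandler` and `AssigningFormatter` are hypothetical stand-ins for the default string-formatting `ops` handler and for the new assignment-emitting wrapper.

```python
# Hedged sketch: hypothetical stand-ins, not torch._inductor classes.


class NaiveStringHandler:
    """Formats each op by splicing argument strings into a template.

    Reused arguments get duplicated, so nested expressions grow exponentially.
    """

    def load(self, name, index):
        return f"load({name}, {index})"

    def mul(self, a, b):
        return f"{a} * {b}"


class AssigningFormatter:
    """Wraps a string-producing handler and emits one assignment per op call."""

    def __init__(self, parent):
        self.parent = parent
        self.lines = []
        self._counter = 0

    def _assign(self, expr):
        # Each op result is bound to a fresh temporary and referenced by name,
        # so shared sub-expressions are written out exactly once.
        var = f"tmp{self._counter}"
        self._counter += 1
        self.lines.append(f"{var} = {expr}")
        return var

    def __getattr__(self, name):
        parent_op = getattr(self.parent, name)

        def inner(*args):
            return self._assign(parent_op(*args))

        return inner

    def getvalue(self, result):
        self.lines.append(f"return {result}")
        return "\n".join(self.lines)


def fn(ops, a):
    for _ in range(3):
        a = ops.mul(a, a)
    return a


# Naive string formatting: 8 loads and 7 multiplies for 3 ops of real work.
naive = NaiveStringHandler()
print(fn(naive, naive.load("arg_0", "i0")))

# Assignment-based formatting: output stays linear in the number of ops calls.
fmt = AssigningFormatter(NaiveStringHandler())
print(fmt.getvalue(fn(fmt, fmt.load("arg_0", "i0"))))
```

The second print produces the `tmp0 = load(arg_0, i0)` form quoted above, while the first reproduces the exponential blowup.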
1 parent c4718e9 commit 81f351a

7 files changed: +200, -154 lines


test/inductor/test_torchinductor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -5354,7 +5354,7 @@ def fn(x1, x2):
         traced = make_fx(fn)(x1, x2)
         compiled = compile_fx_inner(traced, [x1, x2])
         assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
-        assert metrics.generated_cpp_vec_kernel_count == 1
+        assert metrics.generated_cpp_vec_kernel_count == 0

         torch._dynamo.reset()
         metrics.reset()
```

test/inductor/test_torchinductor_opinfo.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -363,8 +363,6 @@ def process(device_type):
     "nn.functional.local_response_norm": {f16},
     "outer": {f16},
     "quantile": {f32, f64},
-    "scatter_reduce.amax": {f16, f32, f64},
-    "scatter_reduce.amin": {f16, f32, f64},
     "tanh": {f16},
 }
```

torch/_inductor/codegen/common.py

Lines changed: 13 additions & 110 deletions
```diff
@@ -2,19 +2,24 @@
 import contextlib
 import itertools
 import logging
-import math
 import re
-import textwrap
 import typing
 from collections import namedtuple
-from io import StringIO
 from itertools import chain

 import sympy
 from sympy.printing.printer import Printer

 from .. import metrics
-from ..utils import free_symbol_startswith, sympy_dot, sympy_subs, sympy_symbol, unique
+from ..utils import (
+    DeferredLineBase,
+    free_symbol_startswith,
+    IndentedBuffer,
+    sympy_dot,
+    sympy_subs,
+    sympy_symbol,
+    unique,
+)
 from ..virtualized import ops, V

 log = logging.getLogger(__name__)
@@ -125,102 +130,12 @@ def remainder(a, b):
     return ops.where(f"(({r} != 0) & (({r} < 0) != ({b} < 0)))", ops.add(r, b), r)


-class IndentedBuffer:
-    tabwidth = 4
-
-    def __init__(self, initial_indent=0):
-        self._lines = []
-        self._indent = initial_indent
-
-    def getvalue(
-        self,
-    ):
-        buf = StringIO()
-        for line in self._lines:
-            if isinstance(line, DeferredLine):
-                line = line()
-                if line is None:
-                    continue
-            assert isinstance(line, str)
-            buf.write(line)
-            buf.write("\n")
-        return buf.getvalue()
-
-    def getrawvalue(self):
-        buf = StringIO()
-        for line in self._lines:
-            if isinstance(line, DeferredLine):
-                line = line()
-                if line is None:
-                    continue
-            assert isinstance(line, str)
-            # backslash implies line continuation
-            if line.endswith("\\"):
-                buf.write(line[:-1])
-            else:
-                buf.write(line)
-                buf.write("\n")
-        return buf.getvalue()
-
-    def clear(self):
-        self._lines.clear()
-
-    def __bool__(self):
-        return bool(self._lines)
-
-    def prefix(self):
-        return " " * (self._indent * self.tabwidth)
-
-    def writeline(self, line):
-        if isinstance(line, DeferredLine):
-            self._lines.append(line.with_prefix(self.prefix()))
-        elif line.strip():
-            self._lines.append(f"{self.prefix()}{line}")
-        else:
-            self._lines.append("")
-
-    def writelines(self, lines):
-        for line in lines:
-            self.writeline(line)
-
-    def indent(self, offset=1):
-        @contextlib.contextmanager
-        def ctx():
-            self._indent += offset
-            yield
-            self._indent -= offset
-
-        return ctx()
-
-    def splice(self, other_code, strip=False):
-        if isinstance(other_code, IndentedBuffer):
-            dedent = float("inf")
-            for line in other_code._lines:
-                if line:
-                    dedent = min(dedent, len(line) - len(line.lstrip()))
-            if math.isinf(dedent):
-                dedent = 0
-            for line in other_code._lines:
-                IndentedBuffer.writeline(self, line[dedent:])
-        else:
-            other_code = textwrap.dedent(other_code)
-            if strip:
-                other_code = other_code.lstrip()
-            if not other_code:
-                return
-            other_code = other_code.rstrip()
-            for line in other_code.split("\n"):
-                self.writeline(line)
-
-
-class DeferredLine:
+class DeferredLine(DeferredLineBase):
     """A line that can be 'unwritten' by adding name to V.graph.removed_buffers"""

     def __init__(self, name, line):
-        if not line.strip():
-            line = ""
+        super().__init__(line)
         self.name = name
-        self.line = line

     def __call__(self):
         if (
@@ -230,20 +145,8 @@ def __call__(self):
             return self.line
         return None

-    def with_prefix(self, prefix):
-        return DeferredLine(self.name, f"{prefix}{self.line}")
-
-    def lstrip(self):
-        return DeferredLine(self.name, self.line.lstrip())
-
-    def __getitem__(self, index):
-        return DeferredLine(self.name, self.line[index])
-
-    def __bool__(self):
-        return bool(self.line)
-
-    def __len__(self):
-        return len(self.line)
+    def _new_line(self, line):
+        return DeferredLine(self.name, line)


 class DeferredIndentedBuffer(IndentedBuffer):
```
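A side note on the refactor above: `DeferredLine` now keeps only `__call__` and a `_new_line` hook, while the generic line-manipulation helpers move to a shared `DeferredLineBase` in `torch/_inductor/utils.py`, which this diff does not show. Below is a rough sketch of such a base class, reconstructed from the methods deleted here rather than from the actual utils.py source:

```python
# Hedged sketch of a DeferredLineBase-style base class; the real one lives in
# torch/_inductor/utils.py and may differ in details.


class DeferredLineBase:
    """A line of output whose final text is decided later (or dropped)."""

    def __init__(self, line):
        if not line.strip():
            line = ""
        self.line = line

    def __call__(self):
        """Return the final text for this line, or None to omit it."""
        raise NotImplementedError()

    def _new_line(self, line):
        """Return a copy of self carrying `line` in place of the original text."""
        raise NotImplementedError()

    def with_prefix(self, prefix):
        return self._new_line(f"{prefix}{self.line}")

    def lstrip(self):
        return self._new_line(self.line.lstrip())

    def __getitem__(self, index):
        return self._new_line(self.line[index])

    def __bool__(self):
        return bool(self.line)

    def __len__(self):
        return len(self.line)
```

With a base class of this shape, subclasses such as `DeferredLine` only have to say how to rebuild themselves (`_new_line`) and when to suppress themselves (`__call__`).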

torch/_inductor/dependencies.py

Lines changed: 16 additions & 13 deletions
```diff
@@ -7,7 +7,6 @@

 import sympy

-from . import config
 from .codegen.common import index_prevent_reordering
 from .utils import (
     get_dtype_size,
@@ -165,24 +164,15 @@ def merge(self, other):
     )


-class RecordLoadStore(V.MockHandler):  # type: ignore[name-defined]
+class _RecordLoadStoreInner(V.MockHandler):
     def __init__(self, var_ranges: VarRanges, normalize: bool):
-        super(RecordLoadStore, self).__init__()
+        super().__init__()
         self._reads: Set[MemoryDep] = set()
         self._writes: Set[MemoryDep] = set()
         self._index_exprs: Set[IndexExprDep] = set()
         self._var_ranges: VarRanges = var_ranges
         self._normalize: bool = normalize

-    # Truncate the expr str by a threshold to prevent it's too long
-    # and cause process hanging. The result is not used.
-    # https://github.com/pytorch/torchdynamo/issues/1352
-    @staticmethod
-    def truncate_expr(expr):
-        if len(expr) > config.realize_bytes_threshold:
-            expr = f"{expr[:config.realize_bytes_threshold]}..."
-        return expr
-
     def canonicalize(
         self, index: sympy.Expr
     ) -> Tuple[sympy.Expr, Tuple[sympy.Expr, ...]]:
@@ -230,6 +220,14 @@ def index_expr(self, index: sympy.Expr, dtype) -> str:
         return f"index_expr({sympy_str(index)}, {dtype})"


+class RecordLoadStore(V.KernelFormatterHandler):
+    def __init__(self, var_ranges: VarRanges, normalize: bool):
+        parent_handler = _RecordLoadStoreInner(
+            var_ranges=var_ranges, normalize=normalize
+        )
+        super().__init__(parent_handler=parent_handler)
+
+
 def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]:
     cnt = itertools.count()
     var_ranges: VarRanges = collections.OrderedDict()
@@ -279,8 +277,13 @@ def extract_read_writes(
     else:
         range_vars = [*itertools.chain(*args)]

+    inner = rw.parent_handler
     return ReadWrites(
-        set(rw._reads), set(rw._writes), rw._index_exprs, range_vars, var_ranges
+        set(inner._reads),
+        set(inner._writes),
+        inner._index_exprs,
+        range_vars,
+        var_ranges,
     )
```

torch/_inductor/ir.py

Lines changed: 16 additions & 20 deletions
```diff
@@ -357,13 +357,12 @@ def _index(ranges, prefix="i"):

     @cache_on_self
     def inner_fn_str(self):
-        try:
-            with V.set_ops_handler(V.MockHandler()), patch.object(
-                FlexibleLayout, "allow_indexing", True
-            ):
-                return str(self.inner_fn(self._index(self.ranges)))
-        except Exception as e:
-            return f"inner_fn(): {e}"
+        formatter = V.KernelFormatterHandler(V.MockHandler())
+        with V.set_ops_handler(formatter), patch.object(
+            FlexibleLayout, "allow_indexing", True
+        ):
+            result = self.inner_fn(self._index(self.ranges))
+            return formatter.getvalue(result)

     def is_zero_elements(self):
         return any(r == 0 for r in self.ranges)
@@ -479,18 +478,15 @@ def index_length(self):

     @cache_on_self
     def inner_fn_str(self):
-        try:
-            with V.set_ops_handler(V.MockHandler()), patch.object(
-                FlexibleLayout, "allow_indexing", True
-            ):
-                return str(
-                    self.inner_fn(
-                        self._index(self.ranges),
-                        self._index(self.reduction_ranges, "r"),
-                    )
-                )
-        except Exception as e:
-            return f"inner_fn(): {e}"
+        formatter = V.KernelFormatterHandler(V.MockHandler())
+        with V.set_ops_handler(formatter), patch.object(
+            FlexibleLayout, "allow_indexing", True
+        ):
+            result = self.inner_fn(
+                self._index(self.ranges),
+                self._index(self.reduction_ranges, "r"),
+            )
+            return formatter.getvalue(result)

     def constant_to_device(self, device):
         """Move this to a given device. Requires that all reads are to constants."""
@@ -3948,7 +3944,7 @@ def should_realize_on_cpu(loops: Union[Pointwise, Reduction]):
             """
             heavy_ops = ["exp"]  # a list of heavy ops
             fn_str = loops.inner_fn_str()
-            return any([fn_str.startswith(op + "(") for op in heavy_ops])
+            return any([(op + "(") in fn_str for op in heavy_ops])

         if (
             users > 1
```
