This repository was archived by the owner on Aug 21, 2025. It is now read-only.

Commit 2489be5

Reduce overhead of AOT Module (#660)
Adds aot_module_simplified and aot_function_simplified. Falls back to the original aot_module until we prevent tracing of leaf modules.
1 parent: 0a8647c
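
For orientation (not part of the commit): a minimal sketch of calling the new entry point, mirroring the test added below. MyModule is an illustrative name; nop is the no-op compiler that the test imports from functorch.compile.

    import torch
    from functorch.compile import nop
    from functorch._src.aot_autograd import aot_module_simplified

    class MyModule(torch.nn.Module):  # illustrative module; forward must return a tuple/list
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(20, 30)

        def forward(self, x):
            return (self.linear(x),)

    mod = MyModule()
    # fw_compiler is the first extra argument; bw_compiler defaults to fw_compiler
    aot_mod = aot_module_simplified(mod, nop)
    out = aot_mod(torch.randn(128, 20, requires_grad=True))
    out[0].sum().backward()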

3 files changed: +107 additions, 0 deletions

functorch/_src/aot_autograd.py

Lines changed: 74 additions & 0 deletions
@@ -527,5 +527,79 @@ def forward(self, *args, **kwargs):
     return AOTModule()
 
 
+def aot_module_simplified(mod: nn.Module, *top_args, **top_kwargs) -> nn.Module:
+    """
+    This is the simplified or low overhead version of aot_module. For frontends
+    like TorchDynamo, the input functions/modules to AOT are static and have
+    unpacked inputs/outputs. This gives us an opportunity to remove the
+    (1) pytree overhead to parse inputs/outputs,
+    (2) AOT Autograd cache,
+    (3) reading of params/buffers in every forward call.
+
+    :func:`aot_module_simplified` removes these overheads.
+    """
+    #########################################################
+
+    params = {
+        **dict(_named_parameters(mod, remove_duplicate=False)),
+        **dict(_named_buffers(mod, remove_duplicate=False)),
+    }
+    params_flat, params_spec = pytree.tree_flatten(params)
+    params_flat = tuple(params_flat)
+    params_len = len(params_flat)
+
+    def functional_call(*args, **kwargs):
+        with _stateless.reparametrize_module(
+            mod, pytree.tree_unflatten(args[:params_len], params_spec)
+        ):
+            out = mod(*args[params_len:], **kwargs)
+        if not isinstance(out, (tuple, list)):
+            raise RuntimeError(
+                "Graph output must be a tuple(). This is so that we can avoid "
+                "pytree processing of the outputs. Please change the module to "
+                "have tuple outputs or use aot_module instead."
+            )
+        return out
+
+    def aot_function_simplified(
+        fn: Callable,
+        fw_compiler: Callable,
+        bw_compiler: Optional[Callable] = None,
+        partition_fn: Callable = default_partition,
+        decompositions: Dict = {},
+        hasher_type: str = "StaticShapeHasher",
+        static_argnums: Optional[Tuple[int]] = None,
+    ) -> Callable:
+        assert static_argnums is None
+        if bw_compiler is None:
+            bw_compiler = fw_compiler
+        compiled_fn = create_aot_autograd_function(
+            fn,
+            fw_compiler,
+            bw_compiler,
+            partition_fn,
+            decompositions,
+            grad_state=torch.is_grad_enabled(),
+        ).apply
+
+        return compiled_fn
+
+    compiled_f = aot_function_simplified(functional_call, *top_args, **top_kwargs)
+
+    class AOTModule(nn.Module):
+        def __init__(self):
+            super(AOTModule, self).__init__()
+            self.orig_module = mod
+
+        def forward(self, *args, **kwargs):
+            return compiled_f(
+                *params_flat,
+                *args,
+                **kwargs,
+            )
+
+    return AOTModule()
+
+
 compiled_function = aot_function
 compiled_module = aot_module
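
Note the RuntimeError above: because the simplified variant skips pytree processing of outputs, the wrapped module must return a tuple or list. A hedged sketch of what that looks like from the caller's side (BareOutput/TupleOutput are illustrative names, not from the commit; nop is the no-op compiler from functorch.compile):

    import torch
    from functorch.compile import nop
    from functorch._src.aot_autograd import aot_module_simplified

    class BareOutput(torch.nn.Module):   # illustrative: bare tensor output is rejected
        def forward(self, x):
            return x.relu()

    class TupleOutput(torch.nn.Module):  # illustrative: tuple output is accepted
        def forward(self, x):
            return (x.relu(),)

    x = torch.randn(4, requires_grad=True)
    try:
        aot_module_simplified(BareOutput(), nop)(x)
    except RuntimeError as err:
        print(err)  # complains that the graph output must be a tuple
    print(aot_module_simplified(TupleOutput(), nop)(x))  # traced through nop; tuple of outputs

Modules that cannot be changed to return tuples can keep using aot_module, which retains the pytree handling of outputs.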

functorch/compile/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
     compiled_module,
     num_of_recompilations,
     clear_compile_cache,
+    aot_module_simplified,
 )
 from .._src.compilers import (
     ts_compile,
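
With this line, the new entry point is also re-exported from the public functorch.compile namespace (the test below imports it from the private module instead), so a caller can simply write:

    from functorch.compile import aot_module_simplified, nop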

test/test_pythonkey.py

Lines changed: 32 additions & 0 deletions
@@ -17,6 +17,7 @@
     grad, vjp, vmap, jacrev,
     make_fx
 )
+from functorch._src.aot_autograd import aot_module_simplified
 from functorch.compile import (
     nnc_jit, compiled_function, compiled_module,
     min_cut_rematerialization_partition, aot_function, aot_module, decomposition_table, nop,
@@ -540,6 +541,37 @@ def f(x):
         torch.autograd.grad(out, inp, torch.randn(3, 2))
 
 
+class TestAOTModuleSimplified(TestCase):
+    def test_aot_module_simplified(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(20, 30)
+
+            def forward(self, x, y):
+                return (self.linear(x) + y, )
+
+        mod = MockModule()
+        mod.zero_grad()
+
+        x = torch.randn(128, 20, requires_grad=True)
+        y = torch.randn(128, 30, requires_grad=True)
+        inputs = [x, y]
+        cloned_inputs = [x.detach().clone().requires_grad_(True) for x in inputs]
+
+        ref = mod(*inputs)
+        ref[0].sum().backward()
+
+        aot_mod = aot_module_simplified(mod, nop)
+        aot_mod.zero_grad()
+        res = aot_mod(*cloned_inputs)
+        res[0].sum().backward()
+
+        assert torch.allclose(ref[0], res[0])
+        assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad)
+        assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad)
+
+
 only_for = ("cpu")
 instantiate_device_type_tests(
     TestPythonKey,
