     _SplitterSettingBase,
 )
 from torch.fx.passes.tools_common import CALLABLE_NODE_OPS, NodeSet
-from torch_tensorrt.dynamo._defaults import DEBUG, MIN_BLOCK_SIZE
+from torch_tensorrt.dynamo._defaults import (
+    DEBUG,
+    MIN_BLOCK_SIZE,
+    REQUIRE_FULL_COMPILATION,
+)
 from torch_tensorrt.dynamo.conversion.converter_registry import (
     DYNAMO_CONVERTERS as CONVERTERS,
 )
@@ -92,6 +96,7 @@ class TRTPartitioner(_SplitterBase):  # type: ignore
         allowed_single_node_partition_ops: Nodes which can be included in single-node partitions.
             Generally useful for module-level exclusion ops which are intensive despite being single functions
         min_block_size: Minimum number of computational operators per block
+        require_full_compilation: Require that all computational operators be run in TRT
     Returns:
         torch.fx.GraphModule
     """
@@ -104,6 +109,7 @@ def __init__(
             Collection[str]
         ] = DEFAULT_SINGLE_NODE_PARTITIONS,
         min_block_size: int = MIN_BLOCK_SIZE,
+        require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
     ):
         """
         Preprocesses graph before splitting:
@@ -142,6 +148,7 @@ def __init__(
 
         self.num_trt_accelerated_subgraphs: Optional[int] = None
         self.allowed_single_node_partition_ops = allowed_single_node_partition_ops
+        self.require_full_compilation = require_full_compilation
 
     def remove_small_acc_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subgraph]:
         """
@@ -151,12 +158,16 @@ def remove_small_acc_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subgraph
         result: List[Subgraph] = []
         for subgraph in subgraphs:
             if subgraph.is_acc:
-                if len(subgraph.nodes) >= self.settings.min_acc_module_size or (
-                    self.allowed_single_node_partition_ops is not None
-                    and any(
-                        ConverterRegistry.qualified_name_or_str(node.target)
-                        in self.allowed_single_node_partition_ops
-                        for node in subgraph.nodes
+                if (
+                    len(subgraph.nodes) >= self.settings.min_acc_module_size
+                    or self.require_full_compilation
+                    or (
+                        self.allowed_single_node_partition_ops is not None
+                        and any(
+                            ConverterRegistry.qualified_name_or_str(node.target)
+                            in self.allowed_single_node_partition_ops
+                            for node in subgraph.nodes
+                        )
                     )
                 ):
                     result.append(subgraph)
@@ -185,6 +196,27 @@ def partition_graph(self) -> torch.fx.GraphModule:
         # Delegate nodes based on operator coverage
         subgraphs = self.put_nodes_into_subgraphs()
 
+        # A graph is fully supported if there is a single partition and all operators are supported/convertible
+        full_support = len([s for s in subgraphs if s.is_acc]) == 1 and not getattr(
+            self.operator_support, "unsupported_operators", True
+        )
+
+        if not full_support and self.require_full_compilation:
+            raise AssertionError(
+                "require_full_compilation=True was specified, but model is not fully supported"
+            )
+
+        if (
+            full_support
+            and self.require_full_compilation
+            and self.settings.min_acc_module_size != MIN_BLOCK_SIZE
+        ):
+            logger.warning(
+                "Detected both require_full_compilation and min_block_size compilation "
+                "arguments were specified. Disregarding min_block_size argument for "
+                "fully supported model."
+            )
+
         # Remove segments smaller than the block size (with exceptions)
         subgraphs = self.remove_small_acc_subgraphs(subgraphs)
 
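Note on the full-support check above: getattr(self.operator_support, "unsupported_operators", True) falls back to True when the support tester does not expose that attribute, so the graph is only treated as fully supported when the tester explicitly reports no unsupported operators. A minimal, illustrative sketch of the two cases the check distinguishes (these class names are hypothetical and not part of the change):

import operator

# Illustrative only: the truthiness that `not getattr(..., True)` relies on.
class ReportingSupport:
    unsupported_operators: dict = {}  # empty report -> falsy -> fully supported

class SilentSupport:
    pass  # attribute absent -> getattr default True -> treated as not fully supported

assert not getattr(ReportingSupport(), "unsupported_operators", True)
assert getattr(SilentSupport(), "unsupported_operators", True)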
@@ -217,6 +249,7 @@ def partition(
     verbose: bool = DEBUG,
     min_block_size: int = MIN_BLOCK_SIZE,
     torch_executed_ops: Collection[Target] = set(),
+    require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
 ) -> torch.fx.GraphModule:
     """Partition an FX GraphModule with aten ops into TRT engines
     Partitioning is based on converter operator support
@@ -226,6 +259,7 @@ def partition(
         verbose: Bool representing whether to print operator support
         min_block_size: Minimum number of operators per TRT-Engine Block
         torch_executed_ops: Collection of operations to run in Torch, regardless of converter coverage
+        require_full_compilation: Require that all computational operators be run in TRT
     Returns:
         torch.fx.GraphModule
     """
@@ -236,7 +270,12 @@ def partition(
 
     # Construct
     supported_ops = OpSupportTester(torch_executed_ops=torch_executed_ops)
-    partitioner = TRTPartitioner(gm, supported_ops, min_block_size=min_block_size)
+    partitioner = TRTPartitioner(
+        gm,
+        supported_ops,
+        min_block_size=min_block_size,
+        require_full_compilation=require_full_compilation,
+    )
 
     partitioned_graph = partitioner.partition_graph()
 
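For reference, a minimal usage sketch of the new flag at the partition() level, assuming partition is re-exported from torch_tensorrt.dynamo.partitioning; the toy aten-level graph below is illustrative and not part of this change:

import torch
from torch import fx

from torch_tensorrt.dynamo import partitioning  # assumed re-export of partition()

# Build a tiny aten-level GraphModule with a single supported computational op.
graph = fx.Graph()
x = graph.placeholder("x")
y = graph.call_function(torch.ops.aten.relu.default, (x,))
graph.output(y)
gm = fx.GraphModule(torch.nn.Module(), graph)

# With require_full_compilation=True the partitioner keeps every accelerated
# subgraph regardless of min_block_size, and partition_graph() raises an
# AssertionError instead of silently falling back to Torch when any operator
# lacks a converter.
partitioned_gm = partitioning.partition(
    gm,
    min_block_size=5,
    require_full_compilation=True,
)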