- Add support for hardware compatibility for Ampere and later architectures (see the TensorRT sketch after this list)
- Add the necessary functions to support the modification throughout the stack, including the C++ and Python components
- Update the ABI version to account for the new metadata format for TRT engines
- Update the engine serialization schema accordingly
- Add test cases to validate the feature
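For context, TensorRT 8.6+ exposes hardware compatibility as a builder-config setting. The sketch below shows that raw API in isolation; it is an assumption about what the torch_tensorrt stack sets under the hood for this feature, not code taken from this PR.

```python
import tensorrt as trt

# Standalone illustration of the underlying TensorRT flag (assumed mapping,
# not lifted from this PR): AMPERE_PLUS builds an engine that stays runnable
# on Ampere and newer GPU architectures.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
config = builder.create_builder_config()
config.hardware_compatibility_level = trt.HardwareCompatibilityLevel.AMPERE_PLUS
```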
"""Compile a TorchScript module for NVIDIA GPUs using TensorRT
@@ -140,6 +143,7 @@ def compile(
         use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
         use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global partitioner (``False``) if looking for best performance
         enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the number of graphs run in TensorRT.
+        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
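A minimal usage sketch of the new flag through the public compile API; the model, input shape, and the `ir="dynamo"` selection are illustrative placeholders rather than code from this PR.

```python
import torch
import torch_tensorrt

model = MyModel().eval().cuda()  # hypothetical placeholder module
inputs = [torch.randn(1, 3, 224, 224).cuda()]

# Build the engine on the current GPU while keeping it runnable on other
# Ampere-or-newer architectures.
trt_module = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    hardware_compatible=True,
)
```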
py/torch_tensorrt/dynamo/_settings.py (3 additions, 0 deletions)
@@ -12,6 +12,7 @@
     DLA_SRAM_SIZE,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     ENGINE_CAPABILITY,
+    HARDWARE_COMPATIBLE,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     NUM_AVG_TIMING_ITERS,
@@ -63,6 +64,7 @@ class CompilationSettings:
         dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer.
         dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
         dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
+        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
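For callers driving the lower-level dynamo APIs directly, the flag can also be set on the settings dataclass. A sketch assuming `CompilationSettings` accepts it as an ordinary keyword field whose default comes from the `HARDWARE_COMPATIBLE` constant imported above:

```python
from torch_tensorrt.dynamo._settings import CompilationSettings

# Assumed: hardware_compatible is a plain dataclass field, so it can be
# overridden at construction time while every other setting keeps its default.
settings = CompilationSettings(hardware_compatible=True)
print(settings.hardware_compatible)  # True
```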