diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
index 5c9ed2ef9c..cd396afc3f 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
@@ -1,10 +1,16 @@
 import math
-from typing import Optional
+from typing import List, Optional
 
+import numpy as np
+import tensorrt as trt
 from torch.fx.node import Target
 from torch_tensorrt.dynamo._SourceIR import SourceIR
+from torch_tensorrt.dynamo.conversion import impl
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
-from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim
+from torch_tensorrt.dynamo.conversion.converter_utils import (
+    get_positive_dim,
+    get_trt_tensor,
+)
 from torch_tensorrt.dynamo.conversion.impl.slice.base import slice
 from torch_tensorrt.fx.converters.converter_utils import (
     has_dynamic_shape,
@@ -96,3 +102,103 @@ def expand(
     layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride)
     set_layer_name(layer, target, name, source_ir)
     return layer.get_output(0)
+
+
+def chunk(
+    ctx: ConversionContext,
+    target: Target,
+    source_ir: Optional[SourceIR],
+    name: str,
+    input: TRTTensor,
+    chunks: int,
+    dim: int,
+) -> List[TRTTensor]:
+    if chunks <= 0:
+        raise RuntimeError(
+            f"chunk expects `chunks` to be greater than 0, got: {chunks}"
+        )
+
+    shape = input.shape
+    dim = get_positive_dim(dim, len(shape))
+
+    if dim >= len(shape):
+        raise RuntimeError(
+            f"chunk expects `dim` to be less than the length of input shape, got: {dim}"
+        )
+
+    dynamic_shape = has_dynamic_shape(input.shape)
+    if dynamic_shape:
+        # The dimension being chunked must be static, even if other dimensions are dynamic
+        assert input.shape[dim] != -1, "Can't chunk on dynamic shape dimension!"
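+
+    # Each chunk covers ceil(size_dim / chunks) elements along `dim`; the final
+    # chunk may be shorter, mirroring torch.chunk semantics.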
+    size_dim = shape[dim]
+    chunk_size = math.ceil(size_dim / chunks)
+    result = []
+    start = 0
+    end = min(start + chunk_size, size_dim)
+    cnt = 0
+
+    while start < end:
+        result.append(
+            slice_op(
+                ctx,
+                target,
+                source_ir,
+                f"{name}_slice_{cnt}",
+                input,
+                dim,
+                start,
+                end,
+                1,
+            )
+        )
+        start = end
+        end = min(start + chunk_size, size_dim)
+        cnt += 1
+
+    return result
+
+
+def cumsum(
+    ctx: ConversionContext,
+    target: Target,
+    source_ir: Optional[SourceIR],
+    name: str,
+    input: TRTTensor,
+    dim: int,
+) -> TRTTensor:
+    input_shape = input.shape
+    dim = get_positive_dim(dim, len(input_shape))
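+    # Build a TRT loop: an iterator peels one slice of `input` along `dim` per
+    # iteration, a recurrence layer carries the running sum, and the loop output
+    # concatenates the per-iteration sums back along `dim`.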
"tensorrt") +backends_no_torchscript=("torch" "dynamo" "torch_compile" "inductor" "tensorrt") # Benchmark VGG16 model diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 5a8ab1cea5..c52fb6ba56 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -293,29 +293,30 @@ def run_tensorrt( input_tensors, params, precision, - is_trt_engine=False, batch_size=1, ): - engine = None - - # If the model file is a TensorRT engine then directly deserialize and run inference - # else convert the torch module to a TensorRT engine first and then run inference - if not is_trt_engine: - compile_settings = { - "inputs": input_tensors, - "enabled_precisions": {precision_to_dtype(precision)}, - "truncate_long_and_double": params.get("truncate", False), - } - - print("Converting method to TensorRT engine...") - with torch.no_grad(), torchtrt.logging.errors(): - model = torchtrt.ts.convert_method_to_trt_engine( - model, "forward", **compile_settings - ) - + # Export an ONNX model and convert to TRT + torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx") + logger = trt.Logger(trt.Logger.WARNING) + builder = trt.Builder(logger) + network = builder.create_network( + 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + ) + parser = trt.OnnxParser(network, logger) + success = parser.parse_from_file("./tmp.onnx") + if not success: + raise ValueError("ONNX conversion failed") + + config = builder.create_builder_config() + if precision == "fp16": + config.set_flag(trt.BuilderFlag.FP16) + start_compile = time.time_ns() + serialized_engine = builder.build_serialized_network(network, config) + end_compile = time.time_ns() + compile_time_s = (end_compile - start_compile) / 1e9 # Deserialize the TensorRT engine - with trt.Logger() as logger, trt.Runtime(logger) as runtime: - engine = runtime.deserialize_cuda_engine(model) + with trt.Runtime(logger) as runtime: + engine = runtime.deserialize_cuda_engine(serialized_engine) print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size) iters = params.get("iterations", 20) @@ -350,7 +351,7 @@ def run_tensorrt( meas_time = end_time - start_time timings.append(meas_time) - recordStats("TensorRT", timings, precision, batch_size) + recordStats("TensorRT", timings, precision, batch_size, compile_time_s) # Deploys inference run for different backend configurations @@ -426,11 +427,10 @@ def run( ) elif backend == "tensorrt": run_tensorrt( - model, + model_torch, input_tensors, params, precision, - is_trt_engine, batch_size, ) elif backend == "dynamo": @@ -439,9 +439,6 @@ def run( elif backend == "torch_compile": run_torch_compile(model_torch, input_tensors, params, precision, batch_size) - elif backend == "torch_compile": - run_torch_compile(model_torch, input_tensors, params, precision, batch_size) - elif backend == "inductor": run_inductor(model_torch, input_tensors, params, precision, batch_size) diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt index d204d3c335..159e6f5eab 100644 --- a/tools/perf/requirements.txt +++ b/tools/perf/requirements.txt @@ -1,7 +1,9 @@ numpy argparse pyyaml +onnx transformers==4.33.2 diffusers==0.21.4 pandas==2.0.1 timm==0.9.8 +