lly-zero-one
diff --git a/benchmarks/tensorexpr/benchmark.py b/benchmarks/tensorexpr/benchmark.py
Lines changed: 9 additions & 18 deletions
diff --git a/benchmarks/tensorexpr/broadcast.py b/benchmarks/tensorexpr/broadcast.py
Lines changed: 4 additions & 142 deletions
diff --git a/benchmarks/tensorexpr/elementwise.py b/benchmarks/tensorexpr/elementwise.py
Lines changed: 13 additions & 117 deletions
@@ -2,16 +2,15 @@
 import itertools
 import framework
 import os
-import types
 import tensor_engine
-#import normalization
+import normalization
 import broadcast
-#import reduction
+import reduction
 import elementwise
-#import softmax
-#import pooling
-#import conv
-#import matmul
+import softmax
+import pooling
+import conv
+import matmul
 
 
 def main():
@@ -32,15 +31,7 @@ def main():
                         help='the underlying tensor engine. only pt for now')
     parser.add_argument('--jit_mode', type=str, default='trace',
                         help='the jit mode to use: one of {trace, none}')
-    parser.add_argument('--cuda_pointwise_loop_levels', type=int, default=None,
-                        help='num of loop levesl for Cuda pointwise operations: 2 or 3')
-    parser.add_argument('--cuda_pointwise_block_count', type=int, default=None,
-                        help='num of block for Cuda pointwise operations')
-    parser.add_argument('--cuda_pointwise_block_size', type=int, default=None,
-                        help='num of blocks for Cuda pointwise operations')
-    parser.add_argument('--cuda_fuser', type=str, default='te',
-                        help='The Cuda fuser backend to use: one of {te, old, none}')
-
+    
     args = parser.parse_args()
 
     def set_global_threads(num_threads):
@@ -82,7 +73,7 @@ def run_default_configs(bench_cls, allow_skip=True):
                     continue
                 else:
                     raise ValueError('attempted to run an unsupported benchmark: %s' % (benchmark.desc()))
-            framework.run_benchmark(benchmark, args)
+            framework.run_benchmark(benchmark)
 
     benchmark_classes = framework.benchmark_classes
     if not args.benchmark_names:
@@ -125,7 +116,7 @@ def run_default_configs(bench_cls, allow_skip=True):
                             pass
                     benchmark = bench_cls(*config)
                     benchmark.jit_mode = args.jit_mode
-                    framework.run_benchmark(benchmark, args)
+                    framework.run_benchmark(benchmark)
 
             if not match_class_name:
                 available_classes = ', '.join([bench_cls.module() for bench_cls in benchmark_classes])
 
@@ -1,7 +1,4 @@
 import framework
-import itertools
-import numpy as np
-import torch
 
 
 class BroadcastMulBench(framework.Benchmark):
@@ -123,142 +120,7 @@ def module():
         return 'broadcast_3args'
 
 
-#framework.register_benchmark_class(BroadcastRowBench)
-#framework.register_benchmark_class(BroadcastMidBench)
-#framework.register_benchmark_class(BroadcastColBench)
-#framework.register_benchmark_class(BroadcastThreeArgs)
-
-# TODO: merge this with elementwise bench
-# A template class for elementwise operations.
-# A derived class will override the class instance to customize its behavior.
-class BroadcastBench(framework.Benchmark):
-    # List of customization class variables.
-    op_str = None
-    binary_op_pt_func = None
-    binary_op_np_func = None
-    unary_op_pt_func = None
-    unary_op_np_func = None
-    split_input = True
-    def __init__(self, mode, device, M, N, K):
-        super().__init__(mode, device)
-        self.M = M
-        self.N = N
-        self.K = K
-        self.d1 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
-        self.d2 = self.rand([K, 1, N], device=device, requires_grad=self.requires_grad)
-        self.d3 = self.rand([M, N], device=device, requires_grad=self.requires_grad)
-        self.d4 = self.rand([K, M, 1], device=device, requires_grad=self.requires_grad)
-        self.inputs = [self.d1, self.d2, self.d3, self.d4]
-
-    def _eval(self, d1, d2, d3, d4, binary_op, unary_op):
-        if not binary_op:
-            binary_op = lambda x, y: x + y
-        if not unary_op:
-            unary_op = lambda x: x
-        if self.split_input:
-            d1 = unary_op(d1)
-            d2 = unary_op(d2)
-            d3 = unary_op(d3)
-            d4 = unary_op(d4)
-        else:
-            d1, d2, d3, d4 = unary_op(d1), unary_op(d2), unary_op(d1 + 0.001), unary_op(d4)
-        a = binary_op(d1, d2)
-        b = binary_op(d3, d4)
-        c = a + b
-        return c
-        
-    def forward(self, d1, d2, d3, d4):
-        binary_op = self.__class__.binary_op_pt_func
-        unary_op = self.__class__.unary_op_pt_func
-        return self._eval(d1, d2, d3, d4, binary_op, unary_op)
-
-    def reference(self):
-        binary_op = self.__class__.binary_op_np_func
-        unary_op = self.__class__.unary_op_np_func
-        [d1, d2, d3, d4] = [self.numpy(d) for d in [self.d1, self.d2, self.d3, self.d4]]
-        return self._eval(d1, d2, d3, d4, binary_op, unary_op)
-
-    def config(self):
-        return [self.M, self.N, self.K]
-
-    @classmethod
-    def module(cls):
-        return 'broadcast_' + cls.op_str
-
-    def memory_workload(self):
-        input_count = len(self.inputs)
-        if self.mode == 'fwd':
-            if self.split_input:
-                sol_count = 1
-                algorithmic_count = 1
-            else:
-                sol_count = 1
-                algorithmic_count = 1
-        else:
-            if self.split_input:
-                sol_count = 1
-                algorithmic_count = input_count
-            else:
-                sol_count = 1
-                algorithmic_count = input_count
-
-        buffer_size = self.M * self.N * self.K * 4
-        return {'sol': buffer_size * sol_count, 'algorithmic': buffer_size * algorithmic_count}
-
-    @staticmethod
-    def default_configs():
-        return [[1 << 8, 1 << 7, 1 << 9]]
-
-
-def register_broadcast_ops():
-    binary_op_list = [
-        ["mul", lambda a, b: a * b],
-        ["add", lambda a, b: a + b],
-        ["sub", lambda a, b: a - b],
-        ["div", lambda a, b: a / (b + 1e-4)],
-        ["pow", lambda a, b: torch.pow(a, b), lambda a, b: np.power(a, b)],  # no fuson triggered
-        ["max", lambda a, b: torch.max(a, b), lambda a, b: np.maximum(a, b)],
-        ["min", lambda a, b: torch.min(a, b), lambda a, b: np.minimum(a, b)],
-    ]
-
-    unary_op_list = [
-        ["exp", lambda x: torch.exp(x), lambda x: np.exp(x)],
-        ["sin", lambda x: torch.sin(x), lambda x: np.sin(x)],
-        ["cos", lambda x: torch.cos(x), lambda x: np.cos(x)],
-    ]
-    
-    for split_input, binary_op in itertools.product([True, False], binary_op_list):
-        # Make a copy of BroadcastBench
-        if len(binary_op) == 2:
-            [op_str, op_pt_func] = binary_op
-            op_np_func = op_pt_func
-        elif len(binary_op) == 3:
-            [op_str, op_pt_func, op_np_func] = binary_op
-        split_str = 'split' if split_input else 'shared'
-        op_str = split_str + '_' + op_str
-        bm_cls = type('BroadcastBench_' + op_str, (BroadcastBench,), {})
-        bm_cls.op_str = op_str
-        bm_cls.binary_op_pt_func = op_pt_func
-        bm_cls.binary_op_np_func = op_np_func
-        bm_cls.split_input = split_input
-        framework.register_benchmark_class(bm_cls)
-                
-    for split_input, unary_op in itertools.product([True, False], unary_op_list):
-        # Make a copy of BroadcastBench
-        if len(unary_op) == 2:
-            [op_str, op_pt_func] = unary_op
-            op_np_func = op_pt_func
-        elif len(unary_op) == 3:
-            [op_str, op_pt_func, op_np_func] = unary_op
-        split_str = 'split' if split_input else 'shared'
-        op_str = split_str + '_' + op_str
-        bm_cls = type('BroadcastBench_' + op_str, (BroadcastBench,), {})
-        bm_cls.op_str = op_str
-        bm_cls.unary_op_pt_func = op_pt_func
-        bm_cls.unary_op_np_func = op_np_func
-        bm_cls.split_input = split_input
-        framework.register_benchmark_class(bm_cls)
-                
-    
-register_broadcast_ops()
-
+framework.register_benchmark_class(BroadcastRowBench)
+framework.register_benchmark_class(BroadcastMidBench)
+framework.register_benchmark_class(BroadcastColBench)
+framework.register_benchmark_class(BroadcastThreeArgs)
@@ -1,18 +1,7 @@
 import framework
-import itertools
-import numpy as np
-import torch
 
-# A template class for elementwise operations.
-# A derived class will override the class instance to customize its behavior.
-class ElementBench(framework.Benchmark):
-    # List of customization class variables.
-    op_str = None
-    binary_op_pt_func = None
-    binary_op_np_func = None
-    unary_op_pt_func = None
-    unary_op_np_func = None
-    split_input = True
+
+class ElementMulBench(framework.Benchmark):
     def __init__(self, mode, device, N):
         super().__init__(mode, device)
         self.N = N
@@ -21,68 +10,28 @@ def __init__(self, mode, device, N):
         self.d3 = self.rand([N], device=device, requires_grad=self.requires_grad)
         self.d4 = self.rand([N], device=device, requires_grad=self.requires_grad)
         self.inputs = [self.d1, self.d2, self.d3, self.d4]
-        self.deterministic = ('rand' not in self.op_str)
 
-    def _eval(self, d1, d2, d3, d4, binary_op, unary_op):
-        if not binary_op:
-            binary_op = lambda x, y: x + y
-        if not unary_op:
-            unary_op = lambda x: x
-        if self.split_input:
-            d1 = unary_op(d1)
-            d2 = unary_op(d2)
-            d3 = unary_op(d3)
-            d4 = unary_op(d4)
-        else:
-            d2 = unary_op(d1 + 0.001)
-            d3 = unary_op(d1 + 0.002)
-            d4 = unary_op(d1 + 0.003)
-            d1 = unary_op(d1)
-        a = binary_op(d1, d2)
-        b = binary_op(d3, d4)
-        c = a + b
-        return c
-        
     def forward(self, d1, d2, d3, d4):
-        binary_op = self.__class__.binary_op_pt_func
-        unary_op = self.__class__.unary_op_pt_func
-        return self._eval(d1, d2, d3, d4, binary_op, unary_op)
+        y = d1 * d2 + d3 * d4
+        return y
 
     def reference(self):
-        binary_op = self.__class__.binary_op_np_func
-        unary_op = self.__class__.unary_op_np_func
-        [d1, d2, d3, d4] = [self.numpy(d) for d in [self.d1, self.d2, self.d3, self.d4]]
-        return self._eval(d1, d2, d3, d4, binary_op, unary_op)
+        return self.numpy(self.d1) * self.numpy(self.d2) + self.numpy(self.d3) * self.numpy(self.d4)
 
     def config(self):
         return [self.N]
 
-    @classmethod
-    def module(cls):
-        return 'element_' + cls.op_str
+    @staticmethod
+    def module():
+        return 'element_mul'
 
     def memory_workload(self):
-        input_count = len(self.inputs)
         if self.mode == 'fwd':
-            if self.split_input:
-                sol_count = input_count + 1
-                algorithmic_count = input_count + 1
-            else:
-                sol_count = 1 + 1
-                algorithmic_count = 1 + 1
-            if 'rand' in self.op_str:
-                sol_count = 1
-                algorithmic_count = 1
+            sol_count = 4 + 1
+            algorithmic_count = 3 + 1
         else:
-            if self.split_input:
-                sol_count = (input_count + 1) + (1 + input_count)
-                algorithmic_count = (input_count + 1) + ((2 + 1) * input_count)
-            else:
-                sol_count = 1 + 1
-                algorithmic_count = 1 + 1
-            if 'rand' in self.op_str:
-                sol_count = 1
-                algorithmic_count = 1
+            sol_count = (4 + 1) + (1 + 4)
+            algorithmic_count = (4 + 1) + ((2 + 1) * 4)
 
         buffer_size = self.N * 4
         return {'sol': buffer_size * sol_count, 'algorithmic': buffer_size * algorithmic_count}
@@ -92,57 +41,4 @@ def default_configs():
         return [[1 << 27]]
 
 
-def register_element_ops():
-    binary_op_list = [
-        ["mul", lambda a, b: a * b],
-        ["add", lambda a, b: a + b],
-        ["sub", lambda a, b: a - b],
-        ["div", lambda a, b: a / (b + 1e-4)],
-        ["pow", lambda a, b: torch.pow(a, b), lambda a, b: np.power(a, b)],  # no fuson triggered
-        ["max", lambda a, b: torch.max(a, b), lambda a, b: np.maximum(a, b)],
-        ["min", lambda a, b: torch.min(a, b), lambda a, b: np.minimum(a, b)],
-    ]
-
-    unary_op_list = [
-        ["exp", lambda x: torch.exp(x), lambda x: np.exp(x)],
-        ["sin", lambda x: torch.sin(x), lambda x: np.sin(x)],
-        ["cos", lambda x: torch.cos(x), lambda x: np.cos(x)],
-        ["rand_like", lambda x: torch.rand_like(x), lambda x: np.random.rand(*x.shape)],
-    ]
-    
-    for split_input, binary_op in itertools.product([True, False], binary_op_list):
-        # Make a copy of ElementBench
-        if len(binary_op) == 2:
-            [op_str, op_pt_func] = binary_op
-            op_np_func = op_pt_func
-        elif len(binary_op) == 3:
-            [op_str, op_pt_func, op_np_func] = binary_op
-        split_str = 'split' if split_input else 'shared'
-        op_str = split_str + '_' + op_str
-        bm_cls = type('ElementBench_' + op_str, (ElementBench,), {})
-        bm_cls.op_str = op_str
-        bm_cls.binary_op_pt_func = op_pt_func
-        bm_cls.binary_op_np_func = op_np_func
-        bm_cls.split_input = split_input
-        framework.register_benchmark_class(bm_cls)
-                
-    for split_input, unary_op in itertools.product([True, False], unary_op_list):
-        # Make a copy of ElementBench
-        if len(unary_op) == 2:
-            [op_str, op_pt_func] = unary_op
-            op_np_func = op_pt_func
-        elif len(unary_op) == 3:
-            [op_str, op_pt_func, op_np_func] = unary_op
-        split_str = 'split' if split_input else 'shared'
-        op_str = split_str + '_' + op_str
-        bm_cls = type('ElementBench_' + op_str, (ElementBench,), {})
-        bm_cls.op_str = op_str
-        bm_cls.unary_op_pt_func = op_pt_func
-        bm_cls.unary_op_np_func = op_np_func
-        bm_cls.split_input = split_input
-        framework.register_benchmark_class(bm_cls)
-                
-    
-#framework.register_benchmark_class(ElementMulBench)
-register_element_ops()
-
+framework.register_benchmark_class(ElementMulBench)