@@ -44,8 +44,7 @@ void add_slice_tensor_copy_node(
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
 
   // Need normalize the dim
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
@@ -76,7 +75,13 @@ void add_slice_tensor_copy_node(
   start = normalize_idx(start, in_sizes[dim], 0);
   end = normalize_idx(end, in_sizes[dim], in_sizes[dim]);
 
-  if (dim_index == kChannel4D) {
+  const vkapi::SpecVarList spec_vars = {t_in->packed_dim()};
+
+  const auto packed_dim_idx =
+      static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim());
+
+  // if slice dim is the same as the packed dim, we can use the channel slice
+  if (dim_index == packed_dim_idx) {
     // slice by channel
     std::string kernel_name = "slice_channel";
     kernel_name.reserve(kShaderNameReserve);
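
For reference, the `DIM_LAST - packed_dim()` arithmetic in this hunk maps a WHCN packed-dim value onto the negative `DimIndex` space so it can be compared directly against `dim_index`. Below is a minimal standalone sketch of that mapping; the enum values are assumptions chosen to illustrate the arithmetic, not the backend's actual definitions.

```cpp
#include <cstdint>
#include <initializer_list>
#include <iostream>

// Assumed stand-ins for the WHCN packed-dim values and the negative DimIndex
// convention referenced in the hunk above; the real definitions live in the
// Vulkan backend headers and may differ.
enum WHCNDim : int32_t { kWidthDim = 0, kHeightDim = 1, kChannelsDim = 2 };
enum DimIndex : int32_t { kWidth4D = -1, kHeight4D = -2, kChannel4D = -3, kBatch4D = -4 };
constexpr int32_t DIM_LAST = -1;

int main() {
  // DIM_LAST - packed_dim folds a packed dim into the DimIndex space, so the
  // slice dim can be compared directly against the tensor's packed dim.
  for (int32_t packed : {kWidthDim, kHeightDim, kChannelsDim}) {
    const auto idx = static_cast<DimIndex>(DIM_LAST - packed);
    std::cout << "packed_dim " << packed << " -> DimIndex " << idx << '\n';
  }
  // Expected: 0 -> -1 (kWidth4D), 1 -> -2 (kHeight4D), 2 -> -3 (kChannel4D)
  return 0;
}
```
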
@@ -99,26 +104,31 @@ void add_slice_tensor_copy_node(
          {in, vkapi::MemoryAccessType::READ}},
         {t_out->sizes_ubo(),
          t_in->sizes_ubo(),
-         graph.create_params_buffer(params)}));
+         graph.create_params_buffer(params)},
+        spec_vars));
 
   } else {
     // GPU's coordinate is in x, y, z
     int64_t gpu_dim = -1;
-    int64_t stride = 1;
+    int64_t in_channel_stride = 1;
     if (dim_index == kWidth4D) {
       gpu_dim = 0; // width: x dimension in gpu
       VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
     } else if (dim_index == kHeight4D) {
       gpu_dim = 1; // height: y dimension
       VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-    } else if (dim_index == kBatch4D) {
-      gpu_dim = 2; // batch: z dimension
-
-      // Due to channel packing, each batch value is span over stride planes
-      int64_t n_channels = dim_at(in_sizes, kChannel4D);
-      stride = utils::div_up_4(n_channels);
+    } else if (dim_index == kChannel4D) {
+      gpu_dim = 2; // channel: z dimension
+      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
+      in_channel_stride = dim_at(in_sizes, kChannel4D);
     } else {
-      VK_THROW("Unexpected ncwh_dim!");
+      gpu_dim = 3; // batch: w dimension
+
+      in_channel_stride = dim_at(in_sizes, kChannel4D);
+      if (packed_dim_idx == kChannel4D) {
+        // Due to channel packing, each batch value spans in_channel_stride planes
+        in_channel_stride = utils::div_up_4(in_channel_stride);
+      }
     }
 
     std::string kernel_name = "slice_batch_height_width";
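
The batch branch above computes `in_channel_stride`, the number of GPU z-planes each batch occupies. Under channel packing, four channels share one texel, so the stride becomes ceil(C / 4) rather than C. A small hedged sketch of that arithmetic, with `div_up_4` re-implemented as a hypothetical stand-in for `utils::div_up_4`:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical re-implementation of utils::div_up_4 from the diff: how many
// 4-channel texel planes are needed to hold n channels.
int64_t div_up_4(int64_t n) {
  return (n + 3) / 4;
}

int main() {
  const int64_t channels = 10; // assumed example channel count

  // Width/height-packed input: each batch occupies one z-plane per channel.
  const int64_t stride_plain = channels;            // 10 planes per batch

  // Channels-packed input: four channels share a texel, so each batch spans
  // ceil(C / 4) planes, matching the div_up_4 adjustment in the batch branch.
  const int64_t stride_packed = div_up_4(channels); // ceil(10 / 4) = 3

  std::cout << stride_plain << " vs " << stride_packed << '\n';
  return 0;
}
```
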
@@ -137,7 +147,7 @@ void add_slice_tensor_copy_node(
         static_cast<int32_t>(gpu_dim),
         static_cast<int32_t>(start),
         static_cast<int32_t>(step),
-        static_cast<int32_t>(stride),
+        static_cast<int32_t>(in_channel_stride),
     };
 
     graph.execute_nodes().emplace_back(new DispatchNode(
@@ -147,7 +157,8 @@ void add_slice_tensor_copy_node(
         local_size,
         {{out, vkapi::MemoryAccessType::WRITE},
          {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(), graph.create_params_buffer(params)}));
+        {t_out->sizes_ubo(), graph.create_params_buffer(params)},
+        spec_vars));
   }
 }
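
The `out_sizes[dim] == 1 + (end - start - 1) / step` checks in the hunks above assume `start` and `end` have already been normalized. A self-contained sketch of that arithmetic follows; `normalize_idx` here is a hypothetical approximation of the backend helper (its real signature and fallback handling may differ).

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical approximation of normalize_idx: map a possibly-negative slice
// index into [0, size]. The real helper's behavior may differ; the third
// argument is kept only to mirror the call sites in the diff.
int64_t normalize_idx(int64_t idx, int64_t size, int64_t fallback) {
  (void)fallback; // unused in this sketch
  if (idx < 0) idx += size;
  if (idx < 0) idx = 0;
  if (idx > size) idx = size;
  return idx;
}

int main() {
  const int64_t size = 10;                            // assumed in_sizes[dim]
  const int64_t start = normalize_idx(-7, size, 0);   // -7 -> 3
  const int64_t end = normalize_idx(9, size, size);   //  9 -> 9
  const int64_t step = 2;

  // Elements kept: indices 3, 5, 7 -> 3 values, matching the diff's check
  // out_sizes[dim] == 1 + (end - start - 1) / step.
  const int64_t out_len = 1 + (end - start - 1) / step;
  std::cout << out_len << '\n'; // 3
  return 0;
}
```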