
Commit b4c382e

apivovarov authored and tensorflower-gardener committed
Reverts 1dd7358
PiperOrigin-RevId: 806063246
1 parent 5fd2771 commit b4c382e

16 files changed: +282 −40 lines changed


third_party/xla/xla/backends/gpu/runtime/BUILD

Lines changed: 20 additions & 2 deletions
@@ -890,7 +890,7 @@ cuda_library(
     ],
 )
 
-cuda_library(
+cc_library(
     name = "select_k_exec_stub",
     srcs = ["select_k_exec_stub.cc"],
     hdrs = ["select_k_exec.h"],
@@ -939,7 +939,6 @@ cc_library(
     name = "select_k_thunk",
     srcs = ["select_k_thunk.cc"],
     hdrs = ["select_k_thunk.h"],
-    tags = ["gpu"],
     deps = [
         ":thunk",
         ":thunk_proto_cc",
@@ -955,13 +954,32 @@ cc_library(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
     ] + if_cuda_is_configured(
         [":select_k_exec_raft"],
         no_cuda = [":select_k_exec_stub"],
     ),
 )
 
+xla_cc_test(
+    name = "select_k_thunk_test",
+    srcs = ["select_k_thunk_test.cc"],
+    deps = [
+        ":select_k_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla:literal_util",
+        "//xla:shape_util",
+        "//xla/codegen/emitters:kernel_arguments",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "memset_thunk",
     srcs = ["memset_thunk.cc"],

third_party/xla/xla/backends/gpu/runtime/select_k_exec_raft.cc

Lines changed: 29 additions & 3 deletions
@@ -82,6 +82,12 @@ class OwningScratchAllocator {
     return absl::NotFoundError("Pointer not found");
   }
 
+  se::DeviceMemoryAllocator* get_allocator() const { return allocator_; }
+
+  void set_allocator(se::DeviceMemoryAllocator* allocator) {
+    allocator_ = allocator;
+  }
+
  private:
   int device_ordinal_;
   se::DeviceMemoryAllocator* allocator_;
@@ -96,6 +102,14 @@ class XlaDeviceMemoryResource : public rmm::mr::device_memory_resource {
                           se::DeviceMemoryAllocator* allocator)
       : scratch_allocator_(device_ordinal, allocator) {}
 
+  se::DeviceMemoryAllocator* get_allocator() const {
+    return scratch_allocator_.get_allocator();
+  }
+
+  void set_allocator(se::DeviceMemoryAllocator* allocator) {
+    scratch_allocator_.set_allocator(allocator);
+  }
+
  protected:
   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override {
     auto mem = scratch_allocator_.AllocateBytes(bytes);
@@ -122,6 +136,8 @@ class XlaDeviceMemoryResource : public rmm::mr::device_memory_resource {
 // RAII wrapper for RAFT resources bound to a CUDA stream
 struct RaftStreamResource : public se::Stream::Resource {
   raft::resources res;
+  std::shared_ptr<XlaDeviceMemoryResource> xla_dev_mem_res;
+  ~RaftStreamResource() override = default;
 
   // Factory to create a RaftStreamResource tied to a CUDA stream.
   // Sets up `raft::resources` with a custom XlaDeviceMemoryResource
@@ -138,9 +154,10 @@ struct RaftStreamResource : public se::Stream::Resource {
                                                  cudaStream_t cuda_stream) {
     // Assign our custom AllocatorForRaft for this device
     auto handle = std::make_unique<RaftStreamResource>();
-    raft::resource::set_workspace_resource(
-        handle->res,
-        std::make_shared<XlaDeviceMemoryResource>(device_ordinal, allocator));
+    handle->xla_dev_mem_res =
+        std::make_shared<XlaDeviceMemoryResource>(device_ordinal, allocator);
+    raft::resource::set_workspace_resource(handle->res,
+                                           handle->xla_dev_mem_res);
     // Set Cuda Stream
     raft::resource::set_cuda_stream(handle->res,
                                     rmm::cuda_stream_view{cuda_stream});
@@ -246,6 +263,8 @@ absl::Status select_k_exec(int device_ordinal,
   SelectAlgo algo = choose_select_k_algorithm<T>(batch, n, k);
   VLOG(3) << "select_k_exec_raft: "
           << "device_ordinal: " << device_ordinal << ", "
+          << "allocator: " << allocator << ", "
+          << "stream: " << stream << ", "
           << "data_in: " << data_in.opaque() << " (" << data_in.size() << "B)"
           << ", data_out: " << data_out.opaque() << " (" << data_out.size()
           << "B)"
@@ -268,6 +287,13 @@ absl::Status select_k_exec(int device_ordinal,
   TF_RET_CHECK(resContainer != nullptr)
       << "Failed to create or retrieve RaftStreamResource";
 
+  // resContainer is scoped to a single stream.
+  // Because a stream does not execute select_k_exec concurrently from multiple
+  // threads, it is safe to update the allocator without additional locking.
+  if (allocator != resContainer->xla_dev_mem_res->get_allocator()) {
+    resContainer->xla_dev_mem_res->set_allocator(allocator);
+  }
+
   try {
     // Wrap raw device pointers in RAFT matrix views
     auto input_view =
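
A minimal standalone sketch of the cache-and-rebind pattern this file change applies, using hypothetical stand-in types (Allocator, CachedMemoryResource, StreamResource) rather than the real se::DeviceMemoryAllocator, XlaDeviceMemoryResource, and RaftStreamResource: the per-stream resource keeps the memory resource it installed alive, and later calls only swap the backing allocator pointer when it differs instead of installing a fresh workspace resource each time.

#include <memory>

// Hypothetical stand-ins for the XLA/RAFT types touched by the patch.
struct Allocator {};

class CachedMemoryResource {
 public:
  explicit CachedMemoryResource(Allocator* allocator) : allocator_(allocator) {}
  Allocator* get_allocator() const { return allocator_; }
  void set_allocator(Allocator* allocator) { allocator_ = allocator; }

 private:
  Allocator* allocator_;
};

struct StreamResource {
  // Kept alive for the lifetime of the stream, mirroring xla_dev_mem_res.
  std::shared_ptr<CachedMemoryResource> mem_res =
      std::make_shared<CachedMemoryResource>(nullptr);
};

// Called once per launch. As the comment in the patch argues, a single stream
// does not run this concurrently from multiple threads, so no lock is needed.
void BindAllocator(StreamResource& res, Allocator* allocator) {
  if (allocator != res.mem_res->get_allocator()) {
    res.mem_res->set_allocator(allocator);
  }
}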

third_party/xla/xla/backends/gpu/runtime/select_k_thunk.cc

Lines changed: 11 additions & 0 deletions
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "xla/backends/gpu/runtime/select_k_exec.h"
 #include "xla/backends/gpu/runtime/thunk.h"
@@ -99,4 +100,14 @@ absl::Status SelectKThunk::ExecuteOnStream(const ExecuteParams& params) {
         primitive_util::LowercasePrimitiveTypeName(dtype_)));
   }
 }
+
+absl::StatusOr<ThunkProto> SelectKThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  SelectKThunkProto* select_k_thunk_proto = proto.mutable_select_k_thunk();
+  (void)select_k_thunk_proto;
+  // TODO(upwind): Add fields for SelectKThunkProto.
+  return proto;
+}
 }  // namespace xla::gpu

third_party/xla/xla/backends/gpu/runtime/select_k_thunk.h

Lines changed: 3 additions & 1 deletion
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef XLA_BACKENDS_GPU_RUNTIME_SELECT_K_THUNK_H_
 #define XLA_BACKENDS_GPU_RUNTIME_SELECT_K_THUNK_H_
 
-#include <cstddef>
 #include <cstdint>
 #include <string>
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -61,6 +61,8 @@ class SelectKThunk : public Thunk {
     return args_;
   }
 
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  private:
   std::uint32_t batch_size_;
   std::uint32_t num_elements_;
third_party/xla/xla/backends/gpu/runtime/select_k_thunk_test.cc

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/select_k_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/codegen/emitters/kernel_arguments.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/literal_util.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/shape_util.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(SelectKThunkTest, ToProto) {
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = "profile_annotation";
+  thunk_info.execution_stream_id = 123;
+
+  BufferAllocation alloc0(/*index=*/0, /*size=*/20, /*color=*/0);
+  BufferAllocation::Slice slice0(&alloc0, /*offset=*/0, /*size=*/20);
+
+  BufferAllocation alloc1(/*index=*/1, /*size=*/12, /*color=*/0);
+  BufferAllocation::Slice slice1(&alloc1, /*offset=*/0, /*size=*/12);
+
+  BufferAllocation alloc2(/*index=*/2, /*size=*/12, /*color=*/0);
+  BufferAllocation::Slice slice2(&alloc2, /*offset=*/0, /*size=*/12);
+
+  emitters::KernelArgument arg0(ShapeUtil::MakeShape(F32, {1, 5}), slice0);
+  emitters::KernelArgument arg1(ShapeUtil::MakeShape(F32, {1, 3}), slice1);
+  emitters::KernelArgument arg2(ShapeUtil::MakeShape(U32, {1, 3}), slice2);
+  arg0.set_written(false);
+  arg1.set_written(true);
+  arg2.set_written(true);
+
+  emitters::KernelArguments kernel_arguments({arg0, arg1, arg2});
+
+  auto c1 = HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{.125f, 0.875f, .5f, .25f, 0.75f}}));
+  auto topKInst = HloInstruction::CreateCustomCall(
+      ShapeUtil::MakeShape(F32, {1, 5}), {c1.get()}, "__gpu$TopK");
+
+  SelectKThunk thunk(topKInst.get(), 1, 5, 3, F32, kernel_arguments);
+  TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk.ToProto());
+  EXPECT_THAT(proto, EqualsProto(R"pb(
+                thunk_info { profile_annotation: "custom-call" }
+                select_k_thunk {}
+              )pb"));
+}
+
+}  // namespace
+}  // namespace xla::gpu

third_party/xla/xla/backends/gpu/runtime/thunk.proto

Lines changed: 5 additions & 0 deletions
@@ -133,6 +133,10 @@ message MemzeroThunkProto {
   xla.buffer_assignment.BufferAllocationSliceProto dest_buffer = 1;
 }
 
+message SelectKThunkProto {
+  // TODO(upwind): Add fields for SelectKThunkProto.
+}
+
 message ThunkProto {
   ThunkInfoProto thunk_info = 1;
 
@@ -155,6 +159,7 @@ message ThunkProto {
     HostExecuteDoneThunkProto host_execute_done_thunk = 17;
     DynamicSliceThunkProto dynamic_slice_thunk = 18;
     MemzeroThunkProto memzero_thunk = 19;
+    SelectKThunkProto select_k_thunk = 20;
   }
 }
 
third_party/xla/xla/debug_options_flags.cc

Lines changed: 7 additions & 0 deletions
@@ -451,6 +451,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_detect_unstable_reductions(
       DebugOptions::UNSTABLE_REDUCTION_DETECTION_MODE_NONE);
   opts.set_xla_gpu_experimental_scaled_dot_with_triton(false);
+  opts.set_xla_gpu_experimental_use_raft_select_k(false);
   return opts;
 }
 
@@ -2556,6 +2557,12 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "that checks for unstable reductions in HLO computations. "
       "Acceptable values are: 'none', 'log', and 'crash'. 'none' is "
       "the default."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_experimental_use_raft_select_k",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_experimental_use_raft_select_k),
+      debug_options->xla_gpu_experimental_use_raft_select_k(),
+      "If true, use the raft::matrix::select_k implementation of TopK."));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more
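
The new flag defaults to false, so the RAFT path stays off unless a user opts in. A minimal sketch of flipping it programmatically is below; the helper name EnableRaftSelectK and its HloModuleConfig argument are illustrative only, and the command-line equivalent would be passing --xla_gpu_experimental_use_raft_select_k=true through XLA_FLAGS.

#include "xla/debug_options_flags.h"
#include "xla/service/hlo_module_config.h"

// Sketch only: opt one module's compilation into the RAFT select_k path.
void EnableRaftSelectK(xla::HloModuleConfig& config) {
  // Start from the flag-derived defaults, then override the new option.
  xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
  opts.set_xla_gpu_experimental_use_raft_select_k(true);
  config.set_debug_options(opts);
}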

third_party/xla/xla/service/gpu/BUILD

Lines changed: 1 addition & 0 deletions
@@ -407,6 +407,7 @@ cc_library(
         "//xla/backends/gpu/runtime:ragged_all_to_all_thunk",
         "//xla/backends/gpu/runtime:recv_thunk",
         "//xla/backends/gpu/runtime:replica_id_thunk",
+        "//xla/backends/gpu/runtime:select_k_thunk",
         "//xla/backends/gpu/runtime:send_thunk",
        "//xla/backends/gpu/runtime:sequential_thunk",
        "//xla/backends/gpu/runtime:thunk",

third_party/xla/xla/service/gpu/gpu_compiler.cc

Lines changed: 3 additions & 4 deletions
@@ -724,6 +724,8 @@ absl::Status RunOptimizationPasses(
     const AlgebraicSimplifierOptions& layout_insensitive_algsimp_opts,
     absl::string_view platform_name) {
   const DebugOptions& debug_options = hlo_module->config().debug_options();
+  se::GpuComputeCapability gpu_version =
+      gpu_target_config.device_description.gpu_compute_capability();
 
   HloPassPipeline pipeline("optimization");
   AddHloVerifier(&pipeline, !debug_options.xla_ignore_channel_id());
@@ -738,7 +740,7 @@
     pipeline.AddPass<WindowedEinsumHandler>();
   }
   pipeline.AddPass<TopKSplitter>();
-  pipeline.AddPass<TopkSpecializer>();
+  pipeline.AddPass<TopkSpecializer>(gpu_version);
   pipeline.AddPass<TopkDecomposer>();
 
   pipeline.AddPass<DotDimensionSorter>();
@@ -876,9 +878,6 @@
   // Expand the sort op to support stable sorting if required.
   pipeline.AddPass<StableSortExpander>();
 
-  se::GpuComputeCapability gpu_version =
-      gpu_target_config.device_description.gpu_compute_capability();
-
   // Build simplification pipeline. The passes in here are run to a fixed
   // point.
   [&, &pipeline =

third_party/xla/xla/service/gpu/ir_emitter_unnested.cc

Lines changed: 41 additions & 12 deletions
@@ -110,6 +110,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/ragged_all_to_all_thunk.h"
 #include "xla/backends/gpu/runtime/recv_thunk.h"
 #include "xla/backends/gpu/runtime/replica_id_thunk.h"
+#include "xla/backends/gpu/runtime/select_k_thunk.h"
 #include "xla/backends/gpu/runtime/send_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
@@ -1428,25 +1429,53 @@ absl::Status IrEmitterUnnested::EmitTopKCustomCall(
           : std::tuple<size_t, size_t, size_t>{
                 1, data_shape.dimensions(0), top_elements_shape.dimensions(0)};
 
-  auto wavefront_size =
-      ir_emitter_context_->gpu_device_info().threads_per_warp();
-
-  // Load TopK custom kernel.
-  TF_ASSIGN_OR_RETURN(
-      CustomKernel kernel,
-      kernel::topk::GetTopKKernel("topk", data_shape.element_type(), n, k,
-                                  batch_size, platform_name(), wavefront_size));
-
   // Prepare kernel arguments.
   TF_ASSIGN_OR_RETURN(auto kernel_arguments,
                       emitters::KernelArguments::Create(
                           ir_emitter_context_->buffer_assignment(),
                           GetDefaultBufferAlignment(), instr));
 
-  auto thunk = std::make_unique<CustomKernelThunk>(instr, std::move(kernel),
-                                                   kernel_arguments);
-  AddThunkToThunkSequence(std::move(thunk));
+  auto dtype = data_shape.element_type();
+  bool is_cuda = std::holds_alternative<stream_executor::CudaComputeCapability>(
+      ir_emitter_context_->gpu_compute_capability());
+  if (is_cuda && instr->GetModule()
+                     ->config()
+                     .debug_options()
+                     .xla_gpu_experimental_use_raft_select_k()) {
+    // The heuristic for deciding when to use TopK Custom Kernel versus
+    // Raft::matrix::select_k was developed as part of the initial research
+    // in b/409009349.
+    // CustomCall TopK requires k <= 16 and n >= 1024
+    bool use_raft_select_k = false;
+    if (dtype == PrimitiveType::F32) {
+      use_raft_select_k =
+          (n < 1024) || (n == 1024 && k > 12) || (n > 1024 && k >= 8);
+    } else if (dtype == PrimitiveType::BF16) {
+      use_raft_select_k = n < 1024 || k >= 8;
+    }
+
+    VLOG(3) << "EmitTopKCustomCall: dtype=" << dtype << ", n=" << n
+            << ", k=" << k << ", use_raft_select_k=" << use_raft_select_k;
+
+    if (use_raft_select_k) {
+      AddThunkToThunkSequence(std::make_unique<SelectKThunk>(
+          instr, batch_size, n, k, dtype, kernel_arguments));
+      return absl::OkStatus();
+    }
+  }
+
+  auto wavefront_size =
+      ir_emitter_context_->gpu_device_info().threads_per_warp();
+
+  TF_RET_CHECK(k <= 16) << "CustomCall TopK requires k <= 16";
+  // Load TopK custom kernel.
+  TF_ASSIGN_OR_RETURN(
+      CustomKernel kernel,
+      kernel::topk::GetTopKKernel("topk", dtype, n, k, batch_size,
+                                  platform_name(), wavefront_size));
 
+  AddThunkToThunkSequence(std::make_unique<CustomKernelThunk>(
+      instr, std::move(kernel), kernel_arguments));
   return absl::OkStatus();
 }
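
For readability, the dispatch thresholds above can be restated as free functions. This is a sketch only: the patch keeps the logic inline in EmitTopKCustomCall, the cutoffs below are copied from it, and the patch attributes them to the study in b/409009349; the function names are illustrative.

#include <cstddef>

// True when raft::matrix::select_k is expected to beat the TopK custom kernel
// for an F32 input with n elements per batch row, selecting the top k.
bool UseRaftSelectKForF32(std::size_t n, std::size_t k) {
  // The custom kernel requires k <= 16 and n >= 1024; outside that envelope,
  // or near its edges, the patch routes to raft::matrix::select_k.
  return (n < 1024) || (n == 1024 && k > 12) || (n > 1024 && k >= 8);
}

// Same decision for BF16 inputs.
bool UseRaftSelectKForBF16(std::size_t n, std::size_t k) {
  return n < 1024 || k >= 8;
}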
