Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 6 additions & 11 deletions src/ATen/native/xpu/Copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,14 @@ void memcpyAsync(
bool p2p_enabled) {
Device dst_device = iter.device(0);
Device src_device = iter.device(1);
if (dst_device == src_device) {
copy_kernel(iter);
} else {
if (dst_device != src_device) {
TORCH_INTERNAL_ASSERT(p2p_enabled == true);
auto dst = (char*)iter.data_ptr(0);
auto src = (char*)iter.data_ptr(1);
size_t size = iter.numel() * iter.element_size(0);
auto q = copy_stream.queue();
q.copy(src, dst, size);
}
auto dst = (char*)iter.data_ptr(0);
auto src = (char*)iter.data_ptr(1);
size_t size = iter.numel() * iter.element_size(0);
auto q = copy_stream.queue();
q.copy(src, dst, size);
Comment on lines +73 to +80
Copy link
Preview

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function now always calls q.copy() regardless of whether devices are the same or different. For same-device copies, this may not be the intended behavior since the original code used copy_kernel(iter) for same-device operations. Consider verifying that q.copy() handles same-device copies correctly, or adding a comment explaining why this unified approach is safe.

Suggested change
if (dst_device != src_device) {
TORCH_INTERNAL_ASSERT(p2p_enabled == true);
auto dst = (char*)iter.data_ptr(0);
auto src = (char*)iter.data_ptr(1);
size_t size = iter.numel() * iter.element_size(0);
auto q = copy_stream.queue();
q.copy(src, dst, size);
}
auto dst = (char*)iter.data_ptr(0);
auto src = (char*)iter.data_ptr(1);
size_t size = iter.numel() * iter.element_size(0);
auto q = copy_stream.queue();
q.copy(src, dst, size);
auto dst = (char*)iter.data_ptr(0);
auto src = (char*)iter.data_ptr(1);
size_t size = iter.numel() * iter.element_size(0);
auto q = copy_stream.queue();
if (dst_device == src_device) {
// Use the optimized kernel for same-device copies
copy_kernel(iter);
} else {
TORCH_INTERNAL_ASSERT(p2p_enabled == true);
q.copy(src, dst, size);
}

Copilot uses AI. Check for mistakes.

}

void copy_device_to_device(
Expand Down Expand Up @@ -124,9 +122,6 @@ void copy_device_to_device(
}

if (memcpy_eligible) {
// SYCL queue.memcpy performance is worse than SYCL copy kernel
// implementation. JIRA:
// https://jira.devtools.intel.com/browse/CMPLRLLVM-41292
memcpyAsync(iter, copy_stream, p2p_enabled);
} else {
if (same_neg) {
Expand Down
Loading