@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
     // Why aclrtSynchronizeDevice?
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
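
For review context, a minimal sketch of the two helpers this hunk relies on; their bodies are not part of this diff, so the shapes below are assumptions, but they show why switching the fallback from "" to "on" enables the NZ weight conversion when GGML_CANN_WEIGHT_NZ is unset:

    // Assumed helper shapes, not the actual ggml-cann definitions.
    #include <cstdlib>
    #include <optional>
    #include <string>
    #include <unordered_set>

    static std::optional<std::string> get_env(const std::string & name) {
        const char * val = std::getenv(name.c_str());
        // unset variable -> nullopt -> value_or() fallback is used
        return val ? std::optional<std::string>(val) : std::nullopt;
    }

    static bool parse_bool(const std::string & value) {
        static const std::unordered_set<std::string> truthy = {"on", "1", "yes", "y", "enable", "true"};
        return truthy.count(value) > 0;  // "" -> false, "on" -> true
    }

With value_or(""), an unset variable parses to false (conversion off by default); with value_or("on") it parses to true, so the variable now has to be set to a non-truthy value to opt out.
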
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
                               ACL_MEMCPY_DEVICE_TO_DEVICE));
         return true;
     } else {
+#ifdef ASCEND_310P
+        // TODO: Support 310p P2P copy
+        return false;
+#endif
         // Different device but can access by peer.
         int32_t canAccessPeer = 0;
         ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     int64_t ne0 = tensor->ne[0];
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
 
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
     GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
                 ggml_backend_is_cann(backend_dst));
 
+    GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+
     if (!ggml_backend_buffer_is_cann(src->buffer) ||
         !ggml_backend_buffer_is_cann(dst->buffer)) {
         return false;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
         return true;
     }
     if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+        // TODO: Support 310p P2P copy
+        return false;
+#endif
         ggml_backend_cann_buffer_context* buf_ctx_src =
             (ggml_backend_cann_buffer_context*)buf_src->context;
         ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
         }
 
         // need open both directions for memcpyasync between devices.
-        ggml_cann_set_device(cann_ctx_dst->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
         ggml_cann_set_device(cann_ctx_src->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
                                    ACL_MEMCPY_DEVICE_TO_DEVICE,
                                    cann_ctx_src->stream()));
 
-        // TODO: workaround for Event didn`t work here.
-        aclrtSynchronizeStream(cann_ctx_src->stream());
+        // record event on src stream after the copy
+        if (!cann_ctx_src->copy_event) {
+            ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+        }
+        ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+        // wait on dst stream for the copy to complete
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
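
The event handshake added above lazily creates and reuses a copy_event on the source context, which implies a member of ggml_backend_cann_context that this hunk does not show. A hedged sketch of what that member and its cleanup could look like (assumed, not taken from this diff):

    // Hypothetical, partial shape of the context member the hunk relies on.
    struct ggml_backend_cann_context {
        int32_t    device;               // device id owned by this backend
        aclrtEvent copy_event = nullptr; // lazily created by ggml_backend_cann_cpy_tensor_async
        // ... streams, name, and other existing members ...

        ~ggml_backend_cann_context() {
            if (copy_event != nullptr) {
                ACL_CHECK(aclrtDestroyEvent(copy_event)); // release the event with the context
            }
        }
    };

Recording the event on the source stream and waiting for it on the destination stream keeps the device-to-device copy asynchronous, instead of blocking the host with the removed aclrtSynchronizeStream call.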