Skip to content

Commit 5421f63

Browse files
authored
CANN: Fix precision issue on 310I DUO multi-devices (#15784)
1 parent 820bc98 commit 5421f63

File tree

4 files changed

+28
-16
lines changed

4 files changed

+28
-16
lines changed

docs/backend/CANN.md

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
293293

294294
## Environment variable setup
295295

296-
### GGML_CANN_ASYNC_MODE
297-
298-
Enables asynchronous operator submission. Disabled by default.
299-
300296
### GGML_CANN_MEM_POOL
301297

302-
Specifies the memory pool management strategy:
298+
Specifies the memory pool management strategy. Default is vmm.
303299

304300
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
305301

306302
- prio: Employs a priority queue-based memory pool management.
303+
307304
- leg: Uses a fixed-size buffer pool.
308305

309306
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,9 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
312309

313310
### GGML_CANN_WEIGHT_NZ
314311

315-
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
312+
Converts the matmul weight format from ND to NZ to improve performance. Enabled by default.
316313

317-
### GGML_CANN_DISABLE_ACL_GRAPH
314+
### GGML_CANN_ACL_GRAPH
318315

319-
When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
320-
This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
316+
When enabled, operators are executed using ACL graph execution rather than in op-by-op (eager) mode. Enabled by default.

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1956,7 +1956,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
19561956
aclTensor* acl_weight_tensor;
19571957

19581958
// Only check env once.
1959-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1959+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
19601960
if (weight_to_nz && is_matmul_weight(weight)) {
19611961
int64_t acl_stride[2] = {1, transpose_ne[1]};
19621962

ggml/src/ggml-cann/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
420420
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
421421
device, async_mode ? "ON" : "OFF");
422422
#ifdef USE_ACL_GRAPH
423-
acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
423+
acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
424424
GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
425425
__func__, device,
426426
acl_graph_mode ? "GRAPH" : "EAGER",

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
11961196
// Why aclrtSynchronizeDevice?
11971197

11981198
// Only check env once.
1199-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1199+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
12001200
if (!need_transform(tensor->type)) {
12011201
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
12021202
ACL_MEMCPY_HOST_TO_DEVICE));
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
12791279
ACL_MEMCPY_DEVICE_TO_DEVICE));
12801280
return true;
12811281
} else {
1282+
#ifdef ASCEND_310P
1283+
// TODO: Support 310p P2P copy
1284+
return false;
1285+
#endif
12821286
// Different device but can access by peer.
12831287
int32_t canAccessPeer = 0;
12841288
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
14391443
int64_t ne0 = tensor->ne[0];
14401444

14411445
// Only check env once.
1442-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1446+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
14431447

14441448
// last line must be bigger than 32, because every single op deals with at
14451449
// least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
20002004
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
20012005
ggml_backend_is_cann(backend_dst));
20022006

2007+
GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
2008+
20032009
if (!ggml_backend_buffer_is_cann(src->buffer) ||
20042010
!ggml_backend_buffer_is_cann(dst->buffer)) {
20052011
return false;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
20202026
return true;
20212027
}
20222028
if (backend_src != backend_dst) {
2029+
#ifdef ASCEND_310P
2030+
// TODO: Support 310p P2P copy
2031+
return false;
2032+
#endif
20232033
ggml_backend_cann_buffer_context* buf_ctx_src =
20242034
(ggml_backend_cann_buffer_context*)buf_src->context;
20252035
ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
20362046
}
20372047

20382048
// need open both directions for memcpyasync between devices.
2039-
ggml_cann_set_device(cann_ctx_dst->device);
20402049
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
20412050
ggml_cann_set_device(cann_ctx_src->device);
20422051
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
20472056
ACL_MEMCPY_DEVICE_TO_DEVICE,
20482057
cann_ctx_src->stream()));
20492058

2050-
//TODO: workaround for Event didn`t work here.
2051-
aclrtSynchronizeStream(cann_ctx_src->stream());
2059+
// record event on src stream after the copy
2060+
if (!cann_ctx_src->copy_event) {
2061+
ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
2062+
}
2063+
ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
2064+
2065+
// wait on dst stream for the copy to complete
2066+
ggml_cann_set_device(cann_ctx_dst->device);
2067+
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
20522068
} else {
20532069
// src and dst are on the same backend
20542070
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,

0 commit comments

Comments
 (0)