#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_index_fill_tensor.h>
#include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_permute.h>
@@ -2423,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                          aclTensor* acl_weight, aclTensor* acl_dst) {
    int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                // fp32, atlas a2 will transpose it to HFLOAT32.
-
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
@@ -2441,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
        aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst}=\text{acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst}=\text{acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
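Both new helpers follow the two-phase aclnn calling convention used throughout this file: query the workspace size, allocate it from the backend pool only when it is non-zero, then launch the operator on the context stream. A minimal sketch of that pattern, assuming a hypothetical operator pair `aclnnFoo`/`aclnnFooGetWorkspaceSize` standing in for `aclnnMm`/`aclnnBatchMatMul`:

// Sketch only: aclnnFoo* is a placeholder operator pair, not a real CANN API.
static void aclnn_generic_call_pattern(ggml_backend_cann_context& ctx,
                                       aclTensor* acl_src, aclTensor* acl_dst) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    // 1. Ask the operator how much scratch memory it needs and obtain an executor.
    ACL_CHECK(aclnnFooGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));

    // 2. Allocate the workspace from the backend memory pool only when required.
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    // 3. Launch the operator asynchronously on the context's stream.
    ACL_CHECK(aclnnFoo(workspaceAddr, workspaceSize, executor, ctx.stream()));
}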
/**
 * @brief Performs matrix multiplication with floating-point precision on
 * tensors using the CANN backend.
@@ -2462,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
    // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
    BCAST_MUL_MAT_SHAPE(input, weight, dst);

-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }

-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
    aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }

    ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
    ACL_CHECK(aclDestroyTensor(acl_input_tensor));
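The `n_dims` selection added in this hunk collapses the broadcast shapes to the smallest usable rank so that 2D and 3D multiplies can take the `aclnnMm`/`aclnnBatchMatMul` paths instead of the generic `aclnnMatmul`. A standalone illustration of the same rule is sketched below; `pick_mat_mul_rank` is a hypothetical helper and the real code starts from `bcast_dims` rather than a fixed 4.

#include <cassert>
#include <cstdint>

// Rank-collapse rule mirrored from ggml_cann_mat_mul_fp: if the outermost
// batch dim ne[3] is 1 on both operands, the multiply is at most 3D; if ne[2]
// is 1 on both operands it is plain 2D, and if only the input's ne[2] is 1 it
// is a 3D batched multiply.
static int pick_mat_mul_rank(const int64_t input_ne[4], const int64_t weight_ne[4]) {
    int n_dims = 4;  // illustrative default; the real code uses bcast_dims
    if (input_ne[3] == weight_ne[3] && input_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;
        } else if (input_ne[2] == 1) {
            n_dims = 3;
        }
    }
    return n_dims;
}

int main() {
    const int64_t a[4] = {64, 32, 1, 1}, b[4] = {64, 128, 1, 1};
    assert(pick_mat_mul_rank(a, b) == 2);  // single matrix multiply
    const int64_t c[4] = {64, 32, 1, 1}, d[4] = {64, 128, 8, 1};
    assert(pick_mat_mul_rank(c, d) == 3);  // batched over ne[2]
    return 0;
}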
@@ -2501,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
    ggml_tensor* src0 = dst->src[0];  // weight
    ggml_tensor* src1 = dst->src[1];  // input

-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
    float weight_elem_size;
    if (type == GGML_TYPE_Q4_0) {
        weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
        weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
    }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];

    // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
    size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
    char* scale_offset = (char*)src0->data + weight_size;

    // input
-    void* input_buffer;
    size_t input_elem_size = sizeof(uint16_t);
    int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
    ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
    if (src1->type != GGML_TYPE_F16) {
        aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);

        int64_t* input_cast_ne = src1->ne;
        size_t input_cast_nb[GGML_MAX_DIMS];
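As a sanity check on the layout arithmetic in this hunk, the following standalone sketch computes the same strides for a hypothetical Q8_0 weight with ne = {4096, 11008, 1, 1} (illustrative numbers; QK8_0 is 32 and each group of 32 weights has one fp16 scale stored after the quantized data):

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical Q8_0 weight: 1 byte per element, one fp16 scale per 32 elements.
    const int64_t ne0 = 4096, ne1 = 11008, ne2 = 1, ne3 = 1;
    const size_t weight_elem_size = sizeof(uint8_t);                   // 1 byte
    const size_t weight_stride    = ne1 * ne0 * weight_elem_size;      // bytes of one HW matrix
    const size_t weight_size      = weight_stride * ne2 * ne3;         // all batches
    const size_t scale_elem_size  = sizeof(uint16_t);                  // fp16 scale
    const size_t scale_stride     = ne1 * ne0 / 32 * scale_elem_size;  // scales of one matrix
    // scale_offset = (char*)src0->data + weight_size, as in the code above.
    std::printf("weight_stride=%zu weight_size=%zu scale_stride=%zu\n",
                weight_stride, weight_size, scale_stride);
    // prints: weight_stride=45088768 weight_size=45088768 scale_stride=2818048
    return 0;
}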
@@ -2550,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
        }

        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
        aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
    }

    // output
    size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;

    // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
    uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
-
    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);

-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;

            aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
            aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);

            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
            }
-
            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                workspaceAddr, workspaceSize, executor, ctx.stream()));

-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
            ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
            ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
            ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
        }
    }
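The split handling above keeps each call to `aclnnWeightQuantBatchMatmulV2` within a per-dimension limit by walking the weight's ne[1] axis in chunks of at most 65535 elements and advancing element offsets per chunk. A standalone sketch of the resulting chunk sizes, using an illustrative ne[1] of 152064:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative weight with ne[1] = 152064 rows to split across calls.
    const int64_t ne1 = 152064;
    const int64_t max_elem_size = 65535;
    const int64_t split_size = (ne1 / max_elem_size) + 1;  // 3 chunks here

    // First chunk, then the remaining chunks, mirroring the loop above.
    int64_t chunk = max_elem_size > ne1 ? ne1 : max_elem_size;
    std::printf("split 0: %lld\n", (long long)chunk);       // 65535
    for (int64_t split = 1; split < split_size; split++) {
        chunk = max_elem_size * (split + 1) > ne1 ? ne1 - (max_elem_size * split)
                                                  : max_elem_size;
        std::printf("split %lld: %lld\n", (long long)split, (long long)chunk);
    }
    // prints 65535, 65535, 20994 -- which sums back to 152064.
    return 0;
}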

    // cast out
-    int64_t* output_cast_ne = dst->ne;
-    size_t output_cast_nb[GGML_MAX_DIMS];
-    output_cast_nb[0] = sizeof(uint16_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
-    }
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }

-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));

-    ACL_CHECK(aclDestroyTensor(acl_output_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
}
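The cast-out step rebuilds contiguous byte strides for the fp16 staging buffer in ggml's convention (nb[0] is the element size and nb[i] = nb[i-1] * ne[i-1]) and only casts when the destination is not already F16. A small standalone illustration of that stride rule with made-up dimensions:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative fp16 buffer of shape ne = {4096, 8, 2, 1} (ggml order, innermost first).
    const int     GGML_MAX_DIMS = 4;  // ggml's fixed tensor rank
    const int64_t ne[GGML_MAX_DIMS] = {4096, 8, 2, 1};
    size_t nb[GGML_MAX_DIMS];
    nb[0] = sizeof(uint16_t);               // one fp16 element
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        nb[i] = nb[i - 1] * ne[i - 1];      // contiguous row-major strides
    }
    std::printf("nb = {%zu, %zu, %zu, %zu}\n", nb[0], nb[1], nb[2], nb[3]);
    // prints: nb = {2, 8192, 65536, 131072}
    return 0;
}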
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {