From 4a9f60c201573128f73a65999b3e5cc497fae5c1 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 17:53:38 +0800 Subject: [PATCH 01/22] ggml-cpu: add nnpa compile flag Signed-off-by: Aaron Teo --- ggml/CMakeLists.txt | 1 + ggml/include/ggml-cpu.h | 1 + ggml/src/ggml-cpu/CMakeLists.txt | 7 +++++++ ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++++++ ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +++ 5 files changed, 20 insertions(+) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4e7399f9e68f9..215eb23486814 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index de77a875ec533..e3b79d09bb66f 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -101,6 +101,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_nnpa (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 52cae778cac18..fa4a655a70fdd 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -427,6 +427,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -443,8 +444,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) endif() + + if (GGML_NNPA) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_NNPA) + message(STATUS "NNPA enabled") + endif() elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") message(STATUS "Wasm detected") list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1bb9c4e367f0f..add02d17bee21 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3364,6 +3364,14 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f015c13..a98866a2d8052 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } From 8d4a7987f9c1887f716be96250f2caeee0253929 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 18:43:52 +0800 Subject: [PATCH 02/22] ggml-cpu: add fp16->fp32 nnpa first 
Signed-off-by: Aaron Teo
---
 ggml/src/ggml-cpu/ggml-cpu-impl.h | 12 +++++++++---
 ggml/src/ggml-cpu/simd-mappings.h |  8 +++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 73a8f93987aa3..d839cf5c55e81 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -62,11 +62,17 @@ struct ggml_compute_params {
 #if defined(__s390x__) && defined(__VEC__)
 #ifndef __VXE__
 #define __VXE__
-#endif
+#endif // __VXE__
 #ifndef __VXE2__
 #define __VXE2__
-#endif
-#endif
+#endif // __VXE2__
+#endif // __s390x__ && __VEC__
+
+#if defined(__s390x__) && defined(GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && GGML_NNPA
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index e42364c59aa10..c901ef33f9bd2 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -962,7 +962,12 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_STEP GGML_F32_STEP
 #define GGML_F16_EPR GGML_F32_EPR
 
-static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
+static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
+#ifdef __NNPA__
+    uint16x8_t tmp = vec_xl(0, (const ggml_fp16_t *)x);
+    uint16x8_t nnpa = vec_convert_from_fp16(tmp, 0);
+    return vec_extend_to_fp32_hi(nnpa, 0);
+#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -972,6 +977,7 @@ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
+#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {

From 0ff0d6516247a41d2ade42b42cf0d676a4dd1627 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Fri, 20 Jun 2025 19:10:27 +0800
Subject: [PATCH 03/22] ggml-cpu: add fp32->fp16

Signed-off-by: Aaron Teo
---
 ggml/src/ggml-cpu/simd-mappings.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index c901ef33f9bd2..a278646657a57 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -922,7 +922,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F32_STEP 32
 #define GGML_F32_EPR 4
 
-#define GGML_F32x4 __vector float
+#define GGML_F32x4 float32x4_t
 #define GGML_F32x4_ZERO vec_splats(0.0f)
 #define GGML_F32x4_SET1 vec_splats
 #define GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -980,7 +980,15 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
 #endif
 }
 
-static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t y) {
+#ifdef __NNPA__
+    float32x4_t zero = vec_splats(0.0f);
+    uint16x8_t nnpa = vec_round_from_fp32(y, zero, 0);
+    x[0] = nnpa[0];
+    x[1] = nnpa[1];
+    x[2] = nnpa[2];
+    x[3] = nnpa[3];
+#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -990,6 +998,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
     }
+#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4

From a316d1b7fa957a06ac4f75d1fa30d8298a99f6c0 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Fri, 20 Jun 2025 19:30:03 +0800
Subject: [PATCH 04/22] ggml-cpu:
attempt direct reference Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/simd-mappings.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index a278646657a57..c64b6438ce267 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -984,10 +984,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t y) { #ifdef __NNPA__ float32x4_t zero = vec_splats(0.0f); uint16x8_t nnpa = vec_round_from_fp32(y, zero, 0); - x[0] = nnpa[0]; - x[1] = nnpa[1]; - x[2] = nnpa[2]; - x[3] = nnpa[3]; + x = nnpa; #else float arr[4]; From ff70b3aaf891bb3fca5cd4fce2bb2d74b3f8a3ea Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 19:30:56 +0800 Subject: [PATCH 05/22] Revert "ggml-cpu: attempt direct reference" This reverts commit 23f3f5e5b57a7d8827333e2ac8e3f83e88c17fa2. Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/simd-mappings.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index c64b6438ce267..a278646657a57 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -984,7 +984,10 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t y) { #ifdef __NNPA__ float32x4_t zero = vec_splats(0.0f); uint16x8_t nnpa = vec_round_from_fp32(y, zero, 0); - x = nnpa; + x[0] = nnpa[0]; + x[1] = nnpa[1]; + x[2] = nnpa[2]; + x[3] = nnpa[3]; #else float arr[4]; From 2f58bbcbb89c183340e252362b2a40651f573f1f Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 19:40:56 +0800 Subject: [PATCH 06/22] ggml-cpu: better variable names Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/simd-mappings.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index a278646657a57..773ed3de4e157 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -964,9 +964,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { #ifdef __NNPA__ - uint16x8_t tmp = vec_xl(0, (const ggml_fp16_t *)x); - uint16x8_t nnpa = vec_convert_from_fp16(tmp, 0); - return vec_extend_to_fp32_hi(nnpa, 0); + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(nnpa_dlf16, 0); #else float tmp[4]; @@ -980,20 +980,20 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { #endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { #ifdef __NNPA__ float32x4_t zero = vec_splats(0.0f); - uint16x8_t nnpa = vec_round_from_fp32(y, zero, 0); - x[0] = nnpa[0]; - x[1] = nnpa[1]; - x[2] = nnpa[2]; - x[3] = nnpa[3]; + uint16x8_t v_x = vec_round_from_fp32(v_y, zero, 0); + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); #else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { x[i] = GGML_FP32_TO_FP16(arr[i]); From ae9c5f928a9da40a8d2d3b7901b9ca1cba8f30ad Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 19:54:38 +0800 Subject: 
[PATCH 07/22] ggml-cpu: add ggml fp16->fp32 and fp32->fp16 scalar simd Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/simd-mappings.h | 4 ++-- ggml/src/ggml-impl.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 773ed3de4e157..c3b7bcfe1cc37 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -982,8 +982,8 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { #ifdef __NNPA__ - float32x4_t zero = vec_splats(0.0f); - uint16x8_t v_x = vec_round_from_fp32(v_y, zero, 0); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_x = vec_round_from_fp32(v_y, v_zero, 0); x[0] = vec_extract(v_x, 0); x[1] = vec_extract(v_x, 1); x[2] = vec_extract(v_x, 2); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 6dc5ce0d92fd8..d28b3cce552e7 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -322,6 +322,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t +// on s390x, we use ZDNN_DLFLOAT16 with NNPA // // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 @@ -417,6 +418,29 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + + #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) + #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + + // TODO: Determine if inline assembly is faster + static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(nnpa_dlf16, 0)[0]; + } + + // TODO: Determine if inline assembly is faster + static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_h = vec_round_from_fp32(v_f, v_zero, 0); + return vec_extract(v_h, 0); + } + #else // FP16 <-> FP32 From a88843aa10fcc220be21c9e430311062c56f6ef7 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 21:01:37 +0800 Subject: [PATCH 08/22] ggml-cpu: switch fp16->fp32 to inline asm and test Signed-off-by: Aaron Teo --- ggml/src/ggml-impl.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index d28b3cce552e7..9b2c54c0b1abd 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -428,9 +428,17 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); // TODO: Determine if inline assembly is faster static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - uint16x8_t v_h = vec_splats(h); - uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_h, 0); - return vec_extend_to_fp32_hi(nnpa_dlf16, 0)[0]; + float f; + __asm__ ( + "vlvgp %%v0, %1, %1\n" + "vreph %%v0, %%v0, 3\n" + "vcnf %%v0, %%v0, 0, 1\n" + "vclfnh %%v0, %%v0, 2, 0\n" + "ler %0, %%f0\n" : + /* out */ "=f"(f) : + /* in */ "r"(h) : + /* clobber */ "v0", "f0"); + return f; } // TODO: 
Determine if inline assembly is faster From 70ff4e6b314700d27781b2e041e695ef6c5c2181 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 21:48:07 +0800 Subject: [PATCH 09/22] Revert "ggml-cpu: switch fp16->fp32 to inline asm and test" This reverts commit a88843aa10fcc220be21c9e430311062c56f6ef7. Signed-off-by: Aaron Teo --- ggml/src/ggml-impl.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 9b2c54c0b1abd..d28b3cce552e7 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -428,17 +428,9 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); // TODO: Determine if inline assembly is faster static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__ ( - "vlvgp %%v0, %1, %1\n" - "vreph %%v0, %%v0, 3\n" - "vcnf %%v0, %%v0, 0, 1\n" - "vclfnh %%v0, %%v0, 2, 0\n" - "ler %0, %%f0\n" : - /* out */ "=f"(f) : - /* in */ "r"(h) : - /* clobber */ "v0", "f0"); - return f; + uint16x8_t v_h = vec_splats(h); + uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(nnpa_dlf16, 0)[0]; } // TODO: Determine if inline assembly is faster From 2b4892e20f9229f888a443d1942a7a5bb0c7fdf6 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 21:49:41 +0800 Subject: [PATCH 10/22] ggml-cpu: chore: remove todo comments about inline asm Signed-off-by: Aaron Teo --- ggml/src/ggml-impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index d28b3cce552e7..db90eb1de9932 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -426,14 +426,12 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - // TODO: Determine if inline assembly is faster static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { uint16x8_t v_h = vec_splats(h); uint16x8_t nnpa_dlf16 = vec_convert_from_fp16(v_h, 0); return vec_extend_to_fp32_hi(nnpa_dlf16, 0)[0]; } - // TODO: Determine if inline assembly is faster static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { float32x4_t v_f = vec_splats(f); float32x4_t v_zero = vec_splats(0.0f); From 01b929491b50071a5d0572235dcf5a449da70aa7 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 22:13:11 +0800 Subject: [PATCH 11/22] docs: update s390x docs Signed-off-by: Aaron Teo --- docs/build-s390x.md | 39 ++++++++++++++++++++++++++++----------- docs/build.md | 4 ++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index f44038c586ddc..9b2f421774ae6 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -28,8 +28,9 @@ cmake --build build --config Release -j $(nproc) ``` **Notes**: -- For faster repeated compilation, install [ccache](https://ccache.dev/) -- By default, VXE/VXE2 is enabled. To disable it (not recommended): + +- For faster repeated compilation, install [ccache](https://ccache.dev/) +- By default, VXE/VXE2 is enabled. To disable it (not recommended): ```bash cmake -S . -B build \ @@ -41,18 +42,29 @@ cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc) ``` -- For debug builds: +- By default, NNPA is enabled when available. To disable it (not recommended): + + ```bash + cmake -S . 
-B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_BLAS=ON \ + -DGGML_BLAS_VENDOR=OpenBLAS \ + -DGGML_NNPA=OFF + + cmake --build build --config Release -j $(nproc) + ``` + +- For debug builds: ```bash cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DGGML_BLAS=ON \ -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config Debug -j $(nproc) ``` -- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: +- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: ```bash cmake -S . -B build \ @@ -101,27 +113,33 @@ All models need to be converted to Big-Endian. You can achieve this in three cas ``` For example, + ```bash python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf ``` **Notes:** + - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2. ## IBM Accelerators ### 1. SIMD Acceleration -Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation. +Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation. + +### 2. NNPA Vector Intrinsics Acceleration -### 2. zDNN Accelerator +Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation. -*Only available in IBM z16 or later system. No direction at the moment.* +### 3. zDNN Accelerator -### 3. Spyre Accelerator +_Only available in IBM z16 or later system. No direction at the moment._ -*No direction at the moment.* +### 4. Spyre Accelerator + +_No direction at the moment._ ## Performance Tuning @@ -154,4 +172,3 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl 2. **Other Questions** Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com). - diff --git a/docs/build.md b/docs/build.md index 680b0d8398741..896a9946423cb 100644 --- a/docs/build.md +++ b/docs/build.md @@ -557,6 +557,10 @@ ninja To read documentation for how to build on Android, [click here](./android.md) +## IBM Z & LinuxONE + +To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. 
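A quick way to confirm that the flag introduced in PATCH 01 actually made it into a build is to query the new public API directly. Below is a minimal sketch, not part of the series: it assumes a llama.cpp tree with these patches applied, and the compile/link line is illustrative only.

```c
// nnpa_check.c -- print whether this ggml build reports NNPA support.
// Hypothetical build line, adjust include/library paths to your setup:
//   cc nnpa_check.c -Iggml/include -Lbuild/bin -lggml -lggml-base -o nnpa_check
#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    // ggml_cpu_has_nnpa() (added in PATCH 01) returns 1 when ggml was
    // compiled with GGML_NNPA defined, and 0 otherwise.
    printf("NNPA: %d\n", ggml_cpu_has_nnpa());
    return 0;
}
```

The same bit also surfaces through the CPU backend's feature list, since PATCH 01 pushes an `"NNPA"` entry in `ggml_backend_cpu_get_features()` alongside the existing `"VXE"` one.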
From dca6c7497e4b3e5f153dbb47d713f0c3f4ca3e07 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 22:42:48 +0800 Subject: [PATCH 12/22] ggml-cpu: add nnpa intrinsics for batched fp32->fp16 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index add02d17bee21..b577e032b7ceb 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3137,6 +3137,23 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { _mm_storel_epi64((__m128i *)(y + i), y_vec); } #endif + +#if defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_x1 = vec_xl(i + 0, x); + float32x4_t v_x2 = vec_xl(i + 4, x); + uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); + vec_xst(v_dlf16, i, (uint16_t *)y); + } + // TODO: Enable bottom code once checks are done + // for (; i + 3 < n; i += 4) { + // float32x4_t v_x = vec_xl(i, x); + // float32x4_t v_zero = vec_splats(0.0f); + // uint16x4_t v_dlf16 = vec_round_from_fp32(v_x, v_zero, 0); + // vec_xst(v_dlf16, i, (uint16_t *)y); + // } +#endif + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } From 6b4469b882d3532212f84f26483d3cc559a1ebdd Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 20 Jun 2025 22:46:49 +0800 Subject: [PATCH 13/22] ggml-cpu: fix wrong displacement Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b577e032b7ceb..7ae185950b4d7 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3143,7 +3143,7 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { float32x4_t v_x1 = vec_xl(i + 0, x); float32x4_t v_x2 = vec_xl(i + 4, x); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); - vec_xst(v_dlf16, i, (uint16_t *)y); + vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); } // TODO: Enable bottom code once checks are done // for (; i + 3 < n; i += 4) { From c9d0f36f5e54a9185dfab80270d6ebd18d2b206f Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 01:52:47 +0800 Subject: [PATCH 14/22] ggml-cpu: change vector load displacement too Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7ae185950b4d7..dbb9ff4eaf62b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3140,8 +3140,8 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { #if defined(__NNPA__) for (; i + 7 < n; i += 8) { - float32x4_t v_x1 = vec_xl(i + 0, x); - float32x4_t v_x2 = vec_xl(i + 4, x); + float32x4_t v_x1 = vec_ld(0, x + i + 0); + float32x4_t v_x2 = vec_ld(0, x + i + 4); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); } From 22669f38bd704fd9a953faf308de1601c625e14e Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 01:53:30 +0800 Subject: [PATCH 15/22] ggml-cpu: fix wrong vector intrinsic func Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index dbb9ff4eaf62b..c0a01f8365e14 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3140,8 +3140,8 @@ void ggml_cpu_fp32_to_fp16(const float * x, 
ggml_fp16_t * y, int64_t n) { #if defined(__NNPA__) for (; i + 7 < n; i += 8) { - float32x4_t v_x1 = vec_ld(0, x + i + 0); - float32x4_t v_x2 = vec_ld(0, x + i + 4); + float32x4_t v_x1 = vec_xl(0, x + i + 0); + float32x4_t v_x2 = vec_xl(0, x + i + 4); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); } From 5530bec16eeeb4089f074e216f39fe75d310fe87 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 01:55:24 +0800 Subject: [PATCH 16/22] ggml-cpu: add sigint for gdb to break Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c0a01f8365e14..f3cbcde38d774 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3143,6 +3143,7 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { float32x4_t v_x1 = vec_xl(0, x + i + 0); float32x4_t v_x2 = vec_xl(0, x + i + 4); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); + raise(SIGINT); vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); } // TODO: Enable bottom code once checks are done From 5d478c791e4862ba79602df333165e0455f9b5d1 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 02:01:47 +0800 Subject: [PATCH 17/22] wip: move vector store to tmp variable for debugging Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f3cbcde38d774..a8494cf97b12e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3140,11 +3140,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { #if defined(__NNPA__) for (; i + 7 < n; i += 8) { + uint16x8_t tmp[8]; float32x4_t v_x1 = vec_xl(0, x + i + 0); float32x4_t v_x2 = vec_xl(0, x + i + 4); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); + vec_xst(v_dlf16, 0, tmp); raise(SIGINT); - vec_xst(v_dlf16, 0, (uint16_t *)(y + i)); } // TODO: Enable bottom code once checks are done // for (; i + 3 < n; i += 4) { From dc29eed5ebd4fe89f79a6d36e9d16ec4a9859282 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 02:03:03 +0800 Subject: [PATCH 18/22] wip: change vector to scalar data type Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index a8494cf97b12e..b575e7ac3ba94 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3140,7 +3140,7 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { #if defined(__NNPA__) for (; i + 7 < n; i += 8) { - uint16x8_t tmp[8]; + uint16_t tmp[8]; float32x4_t v_x1 = vec_xl(0, x + i + 0); float32x4_t v_x2 = vec_xl(0, x + i + 4); uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0); From 1be4514357c50b3ab106e262840471a5ab4f75d7 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Jun 2025 02:05:49 +0800 Subject: [PATCH 19/22] wip: vec_round_from_fp32 seem to be throwing rounding errors Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b575e7ac3ba94..668602a474294 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3140,10 +3140,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { 
#if defined(__NNPA__)
     for (; i + 7 < n; i += 8) {
-        uint16_t tmp[8];
+        int16_t tmp[8];
         float32x4_t v_x1 = vec_xl(0, x + i + 0);
         float32x4_t v_x2 = vec_xl(0, x + i + 4);
-        uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0);
+        int16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0);
         vec_xst(v_dlf16, 0, tmp);
         raise(SIGINT);
     }

From 733066b659e4339fd9ae105be2882e2d2f7c65b7 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Sat, 21 Jun 2025 02:06:51 +0800
Subject: [PATCH 20/22] Revert "wip: vec_round_from_fp32 seem to be throwing
 rounding errors"

This reverts commit 1be4514357c50b3ab106e262840471a5ab4f75d7.

Signed-off-by: Aaron Teo
---
 ggml/src/ggml-cpu/ggml-cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 668602a474294..b575e7ac3ba94 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3140,10 +3140,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
 #if defined(__NNPA__)
     for (; i + 7 < n; i += 8) {
-        int16_t tmp[8];
+        uint16_t tmp[8];
         float32x4_t v_x1 = vec_xl(0, x + i + 0);
         float32x4_t v_x2 = vec_xl(0, x + i + 4);
-        int16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0);
+        uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0);
         vec_xst(v_dlf16, 0, tmp);
         raise(SIGINT);
     }

From 5d845799a33203a1bea6c11f41b4ccbffd549ef7 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Sat, 21 Jun 2025 02:08:17 +0800
Subject: [PATCH 21/22] wip: double check original impl

Signed-off-by: Aaron Teo
---
 ggml/src/ggml-cpu/ggml-cpu.c      | 2 +-
 ggml/src/ggml-cpu/simd-mappings.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b575e7ac3ba94..774f91e1fee1c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3145,7 +3145,7 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         float32x4_t v_x2 = vec_xl(0, x + i + 4);
         uint16x8_t v_dlf16 = vec_round_from_fp32(v_x1, v_x2, 0);
         vec_xst(v_dlf16, 0, tmp);
-        raise(SIGINT);
+        // raise(SIGINT);
     }
     // TODO: Enable bottom code once checks are done
     // for (; i + 3 < n; i += 4) {
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index c3b7bcfe1cc37..3137d825a188d 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -988,6 +988,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     x[1] = vec_extract(v_x, 1);
     x[2] = vec_extract(v_x, 2);
     x[3] = vec_extract(v_x, 3);
+    raise(SIGINT);
 #else
     float arr[4];

From 3f0cbf76d6f6186ee36193444838bdd16b685ffd Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Sat, 21 Jun 2025 02:09:54 +0800
Subject: [PATCH 22/22] wip: add missing import

Signed-off-by: Aaron Teo
---
 ggml/src/ggml-cpu/simd-mappings.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index 3137d825a188d..e95fdd0fd9b49 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "ggml-cpu-impl.h"
+#include <signal.h>
 
 //
 // simd mappings
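For readers following along: the FP16->FP32 half of the series (PATCH 02, 06, 07) is the settled part, while PATCH 16 through 22 are still chasing suspected rounding differences in the FP32->FP16 direction via `vec_round_from_fp32`. The conversion chain can also be poked at outside of ggml entirely. The following is a minimal standalone sketch, assuming an IBM z16 (arch14) host and a toolchain whose `<vecintrin.h>` provides the `vec_convert_from_fp16` and `vec_extend_to_fp32_hi` builtins used throughout this series; the typedefs, test values, and build line are illustrative, not taken from ggml.

```c
// nnpa_probe.c -- convert four IEEE FP16 values to FP32 through NNPA,
// mirroring __lzs_f16cx4_load from PATCH 02/06.
// Hypothetical build line: gcc -march=z16 -mzvector nnpa_probe.c -o nnpa_probe
#include <stdio.h>
#include <stdint.h>
#include <vecintrin.h>

// local stand-ins for the vector typedefs ggml-cpu-impl.h provides on s390x
typedef __vector unsigned short uint16x8_t;
typedef __vector float          float32x4_t;

static inline float32x4_t f16x4_to_f32x4(const uint16_t * x) {
    uint16x8_t v_x     = vec_xl(0, x);                  // load 8 halfwords (only 4 used)
    uint16x8_t v_dlf16 = vec_convert_from_fp16(v_x, 0); // IEEE FP16 -> DLFLOAT16
    return vec_extend_to_fp32_hi(v_dlf16, 0);           // DLFLOAT16 -> FP32, high 4 lanes
}

int main(void) {
    // hand-encoded IEEE FP16: 1.0, 2.0, -0.5, 65504.0 (FP16 max), zero padding
    // so the 16-byte vector load stays in bounds; note that 65504 exercises
    // DLFLOAT16's shorter fraction and may not round-trip exactly
    const uint16_t h[8] = { 0x3C00, 0x4000, 0xB800, 0x7BFF, 0, 0, 0, 0 };
    float32x4_t v_f = f16x4_to_f32x4(h);
    for (int i = 0; i < 4; i++) {
        printf("0x%04x -> %f\n", h[i], (double)v_f[i]);
    }
    return 0;
}
```

One detail worth keeping in mind when reading the later WIP patches: DLFLOAT16 is IBM's 1-6-9 (sign/exponent/fraction) format rather than IEEE FP16's 1-5-10, so `vec_round_from_fp32` produces DLFLOAT16 bit patterns, not IEEE half-precision ones. Whether that accounts for the rounding mismatches PATCH 19 suspects is exactly what the `raise(SIGINT)` plus gdb inspection wired up in PATCH 16 through 21 is set up to check.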