Skip to content

Commit 566b648

Browse files
authored
Code clean-up (pytorch#1285)
* code clean-up * remove the profiling output samples
1 parent fcba889 commit 566b648

38 files changed

+57
-259
lines changed

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,18 +202,18 @@ endif()
202202

203203

204204
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
205-
option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
205+
option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
206206

207207
if(USE_BITINT_EXTENSION_INT4)
208208
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
209209
add_compile_options(-Wno-bit-int-extension)
210210
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
211211
endif()
212212

213-
if(USE_OPT_NAVI3X)
213+
if(USE_OPT_GFX11)
214214
add_compile_options(-mcumode)
215215
add_compile_options(-mno-wavefrontsize64)
216-
message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
216+
message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
217217
endif()
218218

219219
## Threads

Jenkinsfile

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -515,38 +515,33 @@ def Build_CK(Map conf=[:]){
515515
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
516516
timeout(time: 24, unit: 'HOURS')
517517
{
518-
//check whether running on Navi or MI300 node
519-
def navi_node = 0
520-
def mi300_node = 0
518+
//check whether to run performance tests on this node
519+
def do_perf_tests = 0
521520
sh 'rocminfo | tee rocminfo.log'
522-
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
523-
navi_node = 1
524-
echo "This is a Navi node"
525-
}
526-
if ( runShell('grep -n "gfx942" rocminfo.log') ){
527-
mi300_node = 1
528-
echo "This is MI300 node"
521+
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
522+
do_perf_tests = 1
523+
echo "Stash profiler and run performance tests"
529524
}
530525
cmake_build(conf)
531526
dir("build"){
532527
//run tests and examples
533528
sh 'make -j check'
534-
if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){
529+
if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){
535530
//we only need the ckProfiler to run the performance tests, so we pack and stash it
536-
//do not stash profiler on Navi or MI300 nodes
531+
//do not stash profiler on nodes where we don't need to run performance tests
537532
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
538533
stash name: "ckProfiler.tar.gz"
539534
}
540-
if (params.RUN_FULL_QA && mi300_node == 0 ){
541-
// build deb packages for all MI100/200/300 targets and prepare to export
535+
if (params.RUN_FULL_QA && do_perf_tests == 0 ){
536+
// build deb packages for all gfx9 targets and prepare to export
542537
sh 'make -j package'
543538
archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
544539
archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
545540
sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
546541
stash name: "ckprofiler_0.2.0_amd64.deb"
547542
}
548543
}
549-
if (params.hipTensor_test && navi_node == 0 ){
544+
if (params.hipTensor_test && do_perf_tests == 0 ){
550545
//build and test hipTensor
551546
sh """#!/bin/bash
552547
rm -rf "${params.hipTensor_branch}".zip
@@ -814,7 +809,7 @@ pipeline {
814809
{
815810
parallel
816811
{
817-
stage("Run Codegen Tests on MI200")
812+
stage("Run Codegen Tests on gfx90a")
818813
{
819814
when {
820815
beforeAgent true
@@ -865,7 +860,7 @@ pipeline {
865860
cleanWs()
866861
}
867862
}
868-
stage("Build CK and run Tests on MI300")
863+
stage("Build CK and run Tests on gfx942")
869864
{
870865
when {
871866
beforeAgent true
@@ -885,7 +880,7 @@ pipeline {
885880
cleanWs()
886881
}
887882
}
888-
stage("Build CK and run Tests on MI200")
883+
stage("Build CK and run Tests on gfx90a")
889884
{
890885
when {
891886
beforeAgent true
@@ -925,13 +920,13 @@ pipeline {
925920
cleanWs()
926921
}
927922
}
928-
stage("Build CK and run Tests on Navi21")
923+
stage("Build CK and run Tests on gfx1030")
929924
{
930925
when {
931926
beforeAgent true
932927
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
933928
}
934-
agent{ label rocmnode("navi21") }
929+
agent{ label rocmnode("gfx1030") }
935930
environment{
936931
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
937932
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
@@ -945,13 +940,13 @@ pipeline {
945940
cleanWs()
946941
}
947942
}
948-
stage("Build CK and run Tests on Navi32")
943+
stage("Build CK and run Tests on gfx1101")
949944
{
950945
when {
951946
beforeAgent true
952947
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
953948
}
954-
agent{ label rocmnode("navi32") }
949+
agent{ label rocmnode("gfx1101") }
955950
environment{
956951
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
957952
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \

client_example/25_wrapper/wrapper_img2col.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,3 @@ int main(int argc, char* argv[])
181181
{1, 1, 1} /*filter_dilations*/);
182182
return 0;
183183
}
184-
// MI100 Perf: 0.255178 ms, 1698.9 GB/s,

example/01_gemm/README.md

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,3 @@
77
#arg3: run kernel # of times (>1)
88
./bin/example_gemm_xdl 0 1 5
99
```
10-
11-
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
12-
```
13-
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
14-
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
15-
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
16-
arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
17-
arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
18-
arg.c_grid_desc_m_n_{ 3840, 4096}
19-
launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
20-
Warm up
21-
Start running 5 times...
22-
Perf: 1.19685 ms, 107.657 TFlops, 78.8501 GB/s
23-
```

example/02_gemm_bilinear/README.md

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,3 @@
99
#arg11 to 12: alpha, beta
1010
./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
1111
```
12-
Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
13-
```
14-
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
15-
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
16-
c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
17-
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
18-
arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
19-
arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
20-
arg.c0_grid_desc_m_n_{ 3840, 4096}
21-
arg.c_grid_desc_m_n_{ 3840, 4096}
22-
launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
23-
Warm up
24-
Start running 1 times...
25-
Perf: 0.936965 ms, 137.517 TFlops, 102.959 GB/s
26-
error: 0
27-
max_diff: 0, 558.5, 558.5
28-
```

example/04_gemm_add_add_fastgelu/README.md

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,3 @@
88
#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
99
./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
1010
```
11-
12-
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
13-
```
14-
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
15-
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
16-
d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
17-
d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
18-
e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
19-
launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
20-
Warm up 1 time
21-
Start running 10 times...
22-
Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8>
23-
```

example/09_convnd_fwd/README.md

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,3 @@
1616
# <right padding>, (ie RightPy, RightPx for 2D)
1717
./bin/example_convnd_fwd_xdl 0 1 100
1818
```
19-
20-
Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
21-
```
22-
input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
23-
weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
24-
output: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
25-
arg.a_grid_desc_k0_m_k1_{432, 165888, 4}
26-
arg.b_grid_desc_k0_n_k1_{432, 256, 4}
27-
arg.c_grid_desc_m_n_{ 165888, 256}
28-
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
29-
Warm up
30-
Start running 100 times...
31-
Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s
32-
```

example/15_grouped_gemm/README.md

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,3 @@
77
#arg3: run kernel # of times (>1)
88
./bin/example_grouped_gemm_xdl_fp16 0 1 5
99
```
10-
11-
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
12-
```
13-
gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1}
14-
gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1}
15-
gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1}
16-
gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1}
17-
group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128}
18-
group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256}
19-
group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384}
20-
group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512}
21-
launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1}
22-
Warm up
23-
Start running 5 times...
24-
Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2>
25-
```

example/26_contraction/README.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,3 @@
77
#arg3: time kernel (0=no, 1=yes)
88
./bin/example_contraction_bilinear_xdl_fp32 1 1 1
99
```
10-
11-
Result (MI100 @ dynamic freq, 46TFlops peak FP32)
12-
```
13-
a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
14-
b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096}
15-
c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1}
16-
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
17-
Warm up 1 time
18-
Start running 10 times...
19-
Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4>
20-
```

example/30_grouped_conv_fwd_multiple_d/README.md

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,3 @@ Following arguments (depending on number of spatial dims):
1616
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
1717
```
1818

19-
Result (MI100)
20-
```
21-
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
22-
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
23-
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
24-
residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
25-
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
26-
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
27-
Warm up 1 time
28-
Start running 10 times...
29-
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default>
30-
```

example/46_gemm_add_multiply/README.md

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,3 @@
88
#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
99
./bin/example_gemm_add_multiply_dl_fp16 1 1 1
1010
```
11-
12-
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
13-
```
14-
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
15-
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
16-
d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
17-
d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
18-
e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
19-
arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
20-
arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
21-
arg.e_grid_desc_m_n_{ 3840, 4096}
22-
launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
23-
Warm up 1 time
24-
Start running 10 times...
25-
Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1>
26-
```

include/ck/ck.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
236236
#ifndef CK_WORKAROUND_DENORM_FIX
237237
#define CK_WORKAROUND_DENORM_FIX 0
238238
#else
239-
// enable only on MI200
239+
// enable only for gfx90a
240240
#define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
241241
#endif // CK_WORKAROUND_DENORM_FIX
242242

include/ck/host_utility/device_prop.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,20 @@ inline bool is_lds_direct_load_supported()
6565
ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942";
6666
}
6767

68-
inline bool is_navi1_supported()
68+
inline bool is_gfx101_supported()
6969
{
7070
return ck::get_device_name() == "gfx1010" || ck::get_device_name() == "gfx1011" ||
7171
ck::get_device_name() == "gfx1012";
7272
}
7373

74-
inline bool is_navi2_supported()
74+
inline bool is_gfx103_supported()
7575
{
7676
return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" ||
7777
ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" ||
7878
ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036";
7979
}
8080

81-
inline bool is_navi3_supported()
81+
inline bool is_gfx11_supported()
8282
{
8383
return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
8484
ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";

include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -829,7 +829,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
829829

830830
static bool IsSupportedArgument(const Argument& arg)
831831
{
832-
if(ck::is_navi3_supported())
832+
if(ck::is_gfx11_supported())
833833
{
834834
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
835835
{

include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
648648
static bool IsSupportedArgument(const Argument& arg)
649649
{
650650
if(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
651-
ck::is_navi2_supported() || ck::is_navi3_supported())
651+
ck::is_gfx103_supported() || ck::is_gfx11_supported())
652652
{
653653
bool pass = true;
654654
pass = pass && arg.K_ % K1 == 0;

include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
858858

859859
static bool IsSupportedArgument(const RawArg& arg)
860860
{
861-
if(ck::is_navi3_supported())
861+
if(ck::is_gfx11_supported())
862862
{
863863
if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
864864
{
@@ -1435,7 +1435,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
14351435
#if 0
14361436
static bool IsSupportedArgument(const Argument& arg)
14371437
{
1438-
if(ck::is_navi3_supported())
1438+
if(ck::is_gfx11_supported())
14391439
{
14401440
if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
14411441
{

include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,8 +1392,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
13921392
static bool IsSupportedArgument(const Argument& arg)
13931393
{
13941394
// check device
1395-
if(!(ck::get_device_name() == "gfx906" || ck::is_navi2_supported() ||
1396-
ck::is_navi3_supported()))
1395+
if(!(ck::get_device_name() == "gfx906" || ck::is_gfx103_supported() ||
1396+
ck::is_gfx11_supported()))
13971397
{
13981398
return false;
13991399
}

include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB<ALayout,
509509

510510
static bool IsSupportedArgument(const Argument& arg)
511511
{
512-
if(ck::is_navi3_supported())
512+
if(ck::is_gfx11_supported())
513513
{
514514
if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, ck::half_t> ||
515515
is_same_v<AccDataType, int32_t>))

include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -535,8 +535,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
535535
}
536536
}
537537

538-
if(ck::get_device_name() == "gfx906" || ck::is_navi2_supported() ||
539-
ck::is_navi3_supported())
538+
if(ck::get_device_name() == "gfx906" || ck::is_gfx103_supported() ||
539+
ck::is_gfx11_supported())
540540
{
541541
return GridwiseGemm::CheckValidity(
542542
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);

include/ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
168168

169169
static bool IsSupportedArgument(const Argument& karg)
170170
{
171-
if(ck::is_navi2_supported() || ck::is_navi3_supported())
171+
if(ck::is_gfx103_supported() || ck::is_gfx11_supported())
172172
{
173173
return GridwiseGemm::CheckValidity(karg);
174174
}

include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
552552
static bool IsSupportedArgument(const Argument& arg)
553553
{
554554
if(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
555-
ck::is_navi2_supported() || ck::is_navi3_supported())
555+
ck::is_gfx103_supported() || ck::is_gfx11_supported())
556556
{
557557
return GridwiseGemm::CheckValidity(
558558
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);

0 commit comments

Comments
 (0)