From a000d7cc7d5c47e5890c1474c48dbdb08b5e2509 Mon Sep 17 00:00:00 2001 From: apicalshark <58538165+apicalshark@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:25:48 +0800 Subject: [PATCH 1/3] Merge PR (#10) (#11) (#13) Merge --------- Signed-off-by: dependabot[bot] Co-authored-by: dennyxbox890 <58538165+dennyxbox890@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build.yml | 1466 ++++----------------- .github/workflows/docker.yml | 128 -- .github/workflows/editorconfig.yml | 27 - .github/workflows/nix-ci-aarch64.yml | 72 - .github/workflows/nix-ci.yml | 79 -- .github/workflows/nix-flake-update.yml | 22 - .github/workflows/nix-publish-flake.yml | 36 - .github/workflows/python-lint.yml | 23 - .github/workflows/server.yml | 190 --- CMakeLists.txt | 19 +- common/CMakeLists.txt | 10 + examples/CMakeLists.txt | 10 + examples/llava/requirements.txt | 2 +- examples/server/tests/requirements.txt | 4 +- ggml/CMakeLists.txt | 10 + ggml/src/CMakeLists.txt | 10 + ggml/src/ggml-cann/kernels/CMakeLists.txt | 11 + ggml/src/vulkan-shaders/CMakeLists.txt | 11 + pocs/CMakeLists.txt | 10 + pocs/vdot/CMakeLists.txt | 11 + poetry.lock | 34 +- src/CMakeLists.txt | 11 + tests/CMakeLists.txt | 11 + 23 files changed, 427 insertions(+), 1780 deletions(-) delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/nix-ci-aarch64.yml delete mode 100644 .github/workflows/nix-ci.yml delete mode 100644 .github/workflows/nix-flake-update.yml delete mode 100644 .github/workflows/nix-publish-flake.yml delete mode 100644 .github/workflows/python-lint.yml delete mode 100644 .github/workflows/server.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 423173b975897..40ec8e738ff34 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,1199 +1,303 @@ name: CI on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: "Create new release" + required: true + type: boolean + push: + branches: + - master + paths: + [ + ".github/workflows/build.yml", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + "**/*.cuh", + "**/*.swift", + "**/*.m", + "**/*.metal", + ] + pull_request: + types: [opened, synchronize, reopened] + paths: + [ + ".github/workflows/build.yml", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + "**/*.cuh", + "**/*.swift", + "**/*.m", + "**/*.metal", + ] concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true # Fine-grant permission # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - contents: write # for creating release +#permissions: +# contents: write # for creating release +permissions: write-all env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} +# GGML_NLOOP: 3 +# GGML_N_THREADS: 1 +# LLAMA_LOG_COLORS: 1 +# LLAMA_LOG_PREFIX: 1 +# LLAMA_LOG_TIMESTAMPS: 1 jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-latest-cmake-x64: - runs-on: macos-12 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-focal-make: - runs-on: ubuntu-20.04 - env: - LLAMA_NODE_AVAILABLE: true - LLAMA_PYTHON_AVAILABLE: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 - - - uses: actions/setup-node@v4 - with: - node-version: "20" - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - CC=gcc-8 make -j $(nproc) - - - name: Test - id: make_test - run: | - CC=gcc-8 make tests -j $(nproc) - make test -j $(nproc) - - ubuntu-focal-make-curl: - runs-on: ubuntu-20.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 - run: | - CC=gcc-8 make -j $(nproc) - - ubuntu-latest-cmake: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF - cmake --build . --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* - - - name: Upload artifacts + # CUDA Release + + ubuntu-latest-cmake: + runs-on: ubuntu-latest + + steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: true + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Dependencies + id: depends + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install build-essential libcurl4-openssl-dev ccache cuda-toolkit-12-6 + sudo apt install software-properties-common + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt update + sudo apt install gcc-13 g++-13 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 13 --slave /usr/bin/g++ g++ /usr/bin/g++-13 + + - name: Build + id: cmake_build + run: | + export PATH=/usr/local/cuda-12.6/bin${PATH:+:${PATH}} + export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON + cmake --build . --config Release -j $(nproc) + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp LICENSE ./build/bin/ + cp $(find . -name "libllama.so") ./build/bin/ + cp $(find . -name "libggml.so") ./build/bin/ + zip -r cudart-llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: cudart-llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip + name: cudart-llama-bin-ubuntu-x64.zip + + # Vulkan Release + + ubuntu-latest-vulkan-cmake: + runs-on: ubuntu-latest + + steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: true + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Dependencies + id: depends + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - + sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo apt-get update + sudo apt-get install build-essential libcurl4-openssl-dev vulkan-sdk ccache + sudo apt install software-properties-common + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt update + sudo apt install gcc-13 g++-13 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 13 --slave /usr/bin/g++ g++ /usr/bin/g++-13 + + - name: Build + id: cmake_build + run: | + export PATH=/usr/local/cuda-12.6/bin${PATH:+:${PATH}} + export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON + cmake --build . --config Release -j $(nproc) + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp LICENSE ./build/bin/ + cp $(find . -name "libllama.so") ./build/bin/ + cp $(find . -name "libggml.so") ./build/bin/ + zip -r vulkan-llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: vulkan-llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip + name: vulkan-llama-bin-ubuntu-x64.zip + + # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # how to debug it. + # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 + + release: + permissions: write-all if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake -DGGML_RPC=ON .. - cmake --build . --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential vulkan-sdk - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake -DGGML_VULKAN=ON .. - cmake --build . --config Release -j $(nproc) - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - mkdir build - cd build - cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. - cmake --build . --config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - mkdir build - cd build - cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. - cmake --build . --config Release -j $(nproc) - - # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know - # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 - macOS-latest-make: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: make_build - env: - LLAMA_FATAL_WARNINGS: 1 - run: | - GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: make_test - run: | - GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) - GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) - - # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know - # how to debug it. - # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 - # would be great if we fix these - macOS-latest-cmake: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -G Xcode .. \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -G Xcode .. \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: xcodebuild for swift package - id: xcodebuild - run: | - xcodebuild -scheme llama -destination "${{ matrix.destination }}" - - - name: Build Swift Example - id: make_build_swift_example - run: | - make swift - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using make - shell: msys2 {0} - run: | - make -j $(nproc) - - - name: Clean after building using make - shell: msys2 {0} - run: | - make clean - - - name: Build using make w/ OpenBLAS - shell: msys2 {0} - run: | - make GGML_OPENBLAS=1 -j $(nproc) - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: - runs-on: windows-2019 - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.3.261.1 - - strategy: - matrix: - include: - - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/kompute - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Build - id: cmake_build - run: | - cmake -S . -B build ${{ matrix.defines }} - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Check AVX512F support - id: check_avx512f - if: ${{ matrix.build == 'avx512-x64' }} - continue-on-error: true - run: | - cd build - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') - echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c - & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main - .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" - - - name: Test - id: cmake_test - # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - - name: Test (Intel SDE) - id: cmake_test_sde - if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - run: | - curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # for some weird reason windows tar doesn't like sde tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - cd build - $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip - - windows-latest-cmake-cuda: - runs-on: windows-2019 - - strategy: - matrix: - cuda: ['12.2.0', '11.7.1'] - build: ['cuda'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install CUDA toolkit - id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.15 - with: - cuda: ${{ matrix.cuda }} - method: 'network' - sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON - cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - run: | - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" - $dst='.\build\bin\cudart\' - robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - - - name: Upload Cuda runtime - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - windows-latest-cmake-sycl: - runs-on: windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install - run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} - runs-on: windows-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - windows-latest-cmake-hip-release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - runs-on: windows-latest - - strategy: - matrix: - gpu_target: [gfx1100, gfx1101, gfx1030] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - - ./gradlew build --no-daemon - -# freeBSD-latest: -# runs-on: macos-12 -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Build -# uses: cross-platform-actions/action@v0.19.0 -# with: -# operating_system: freebsd -# version: '13.2' -# hypervisor: 'qemu' -# run: | -# sudo pkg update -# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas -# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - runs-on: ubuntu-latest - - needs: - - ubuntu-focal-make - - ubuntu-latest-cmake - - macOS-latest-make - - macOS-latest-cmake - - windows-latest-cmake - - windows-latest-cmake-cuda - - windows-latest-cmake-hip-release - - macOS-latest-cmake-arm64 - - macOS-latest-cmake-x64 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v4 - with: - path: ./artifact - - - name: Move artifacts - id: move_artifacts - run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release - - - name: Create release - id: create_release - uses: anzz1/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ steps.tag.outputs.name }} - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) - }); - } - } + runs-on: ubuntu-latest + + needs: + - ubuntu-latest-cmake + - ubuntu-latest-vulkan-cmake + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Download artifacts + id: download-artifact + uses: actions/download-artifact@v4 + with: + path: ./artifact + + - name: Move artifacts + id: move_artifacts + run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release + + - name: Create release + id: create_release + uses: anzz1/action-create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.tag.outputs.name }} + + - name: Upload release + id: upload_release + uses: actions/github-script@v3 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const path = require('path'); + const fs = require('fs'); + const release_id = '${{ steps.create_release.outputs.id }}'; + for (let file of await fs.readdirSync('./artifact/release')) { + if (path.extname(file) === '.zip') { + console.log('uploadReleaseAsset', file); + await github.repos.uploadReleaseAsset({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: release_id, + name: file, + data: await fs.readFileSync(`./artifact/release/${file}`) + }); + } + } # ubuntu-latest-gcc: # runs-on: ubuntu-latest diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index a953cdac907ae..0000000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,128 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. - -name: Publish Docker image - -on: - #pull_request: - push: - branches: - - master - paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] - workflow_dispatch: # allows manual triggering, useful for debugging - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - #if: github.event.pull_request.draft == false - - runs-on: ubuntu-latest - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - matrix: - config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}" - fi - - # list all tags possible - TAGS="" - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}," - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}" - - echo "output_tags=$TAGS" >> $GITHUB_OUTPUT - echo "output_tags=$TAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Docker image (tagged + versioned) - if: github.event_name == 'push' - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.output_tags }} - file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index ae86e99275265..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml deleted file mode 100644 index 0da6acdf1c81e..0000000000000 --- a/.github/workflows/nix-ci-aarch64.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: Nix aarch64 builds - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because QEMU is expensive (e.g. - # 1.5h instead of minutes with the cold cache). - # - # randint(0, 59), randint(0, 23) - - cron: '26 12 * * *' - # But also rebuild if we touched any of the Nix expressions: - push: - branches: - - master - paths: ['**/*.nix', 'flake.lock'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['**/*.nix', 'flake.lock'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub - id-token: write - contents: read - -jobs: - nix-build-aarch64: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install QEMU - # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 - run: | - sudo apt-get update - sudo apt-get install -y qemu-user-static qemu-system-aarch64 - sudo usermod -a -G kvm $USER - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-platforms = aarch64-linux - extra-system-features = nixos-test kvm - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.aarch64-linux" - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --systems aarch64-linux - --flake - ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml deleted file mode 100644 index 8ecbbe53b4ed1..0000000000000 --- a/.github/workflows/nix-ci.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: Nix CI - -on: - workflow_dispatch: # allows manual triggering - push: - branches: - - master - pull_request: - types: [opened, synchronize, reopened] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub - id-token: write - contents: read - -jobs: - nix-eval: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-latest, macos-latest ] - runs-on: ${{ matrix.os }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: List all flake outputs - run: nix flake show --all-systems - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" - nix-build: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-latest, macos-latest ] - runs-on: ${{ matrix.os }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: llama-cpp - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --flake - ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml deleted file mode 100644 index 3a6a96e263e59..0000000000000 --- a/.github/workflows/nix-flake-update.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: update-flake-lock -on: - workflow_dispatch: - schedule: - - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 - -jobs: - lockfile: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@main - - name: Update flake.lock - uses: DeterminateSystems/update-flake-lock@main - with: - pr-title: "nix: update flake.lock" - pr-labels: | - nix - pr-reviewers: philiptaron,SomeoneSerge - token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml deleted file mode 100644 index 2c3c1ebdaeff1..0000000000000 --- a/.github/workflows/nix-publish-flake.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes -name: "Publish a flake to flakestry & flakehub" -on: - push: - tags: - - "*" - workflow_dispatch: - inputs: - tag: - description: "The existing tag to publish" - type: "string" - required: true -jobs: - flakestry-publish: - runs-on: ubuntu-latest - permissions: - id-token: "write" - contents: "read" - steps: - - uses: flakestry/flakestry-publish@main - with: - version: "${{ inputs.tag || github.ref_name }}" - flakehub-publish: - runs-on: "ubuntu-latest" - permissions: - id-token: "write" - contents: "read" - steps: - - uses: "actions/checkout@v4" - with: - ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" - - uses: "DeterminateSystems/nix-installer-action@main" - - uses: "DeterminateSystems/flakehub-push@main" - with: - visibility: "public" - tag: "${{ inputs.tag }}" diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index a8d46f31dd4f5..0000000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: flake8 Lint - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index 699ac095d6c83..0000000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,190 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Verify server deps - id: verify_server_deps - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server - git ls-files --others --modified - git status - ./deps.sh - git status - not_ignored_files="$(git ls-files --others --modified)" - echo "Modified files: ${not_ignored_files}" - if [ -n "${not_ignored_files}" ]; then - echo "Repository is dirty or server deps are not built as expected" - echo "${not_ignored_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - run: | - cd examples/server/tests - PORT=8888 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow - - - server-windows: - runs-on: windows-2019 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - env: - CURL_VERSION: 8.6.0_6 - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - - - name: Build - id: cmake_build - run: | - cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - run: | - cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd examples/server/tests - $env:PYTHONIOENCODING = ":replace" - behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/CMakeLists.txt b/CMakeLists.txt index ef0932a7b9277..04df6f5c5bcf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,22 @@ project("llama.cpp" C CXX) include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) -set(CMAKE_WARN_UNUSED_CLI YES) +#set(CMAKE_WARN_UNUSED_CLI YES) + +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + +#no warn +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -51,7 +66,7 @@ endif() # # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" OFF) option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) # build diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 042e895add5e2..8dc7b49e4176b 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,5 +1,15 @@ # common +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) find_package(Threads REQUIRED) # Build info header diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ead630661c8e2..047081e8c144e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,15 @@ # dependencies +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) find_package(Threads REQUIRED) # third-party diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index cbcbf26c9b4e9..9269ed2f4083b 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -1,5 +1,5 @@ -r ../../requirements/requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -pillow~=10.2.0 +pillow~=10.4.0 torch~=2.2.1 torchvision~=0.17.1 diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 5539548720ff1..5ea3b858b1211 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,7 +1,7 @@ -aiohttp~=3.9.3 +aiohttp~=3.10.10 behave~=1.2.6 huggingface_hub~=0.23.2 numpy~=1.26.4 openai~=1.30.3 prometheus-client~=0.20.0 -requests~=2.32.3 +requests~=2.32.2 diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index cfa6e3f70e4a3..d21a62af75949 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -1,7 +1,17 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. project("ggml" C CXX) include(CheckIncludeFileCXX) +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 7365ac91bf5f2..32b3ac98d7e32 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,5 +1,15 @@ include(CheckCXXCompilerFlag) +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) unset(GGML_CDEF_PUBLIC) add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 5b4fef91b5877..16f7ddb6c6628 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -1,3 +1,14 @@ +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (NOT SOC_TYPE) set (SOC_TYPE "Ascend910B3") endif() diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/vulkan-shaders/CMakeLists.txt index 10075db337737..99ac9b56910d3 100644 --- a/ggml/src/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/vulkan-shaders/CMakeLists.txt @@ -1,3 +1,14 @@ +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt index 03e1d2c04be65..273ffce61752d 100644 --- a/pocs/CMakeLists.txt +++ b/pocs/CMakeLists.txt @@ -1,5 +1,15 @@ # dependencies +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) find_package(Threads REQUIRED) # third-party diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index d5405ad2991d1..2ccfeeaee83af 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -1,3 +1,14 @@ +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) set(TARGET llama-vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/poetry.lock b/poetry.lock index eb6baa6c749c0..9acb8b7bae5b2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "atomicwrites" @@ -31,13 +31,13 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p [[package]] name = "certifi" -version = "2024.2.2" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, - {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -251,24 +251,24 @@ typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "t [[package]] name = "idna" -version = "3.6" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, - {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -1066,13 +1066,13 @@ reference = "pytorch" [[package]] name = "tqdm" -version = "4.66.2" +version = "4.66.3" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, - {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, + {file = "tqdm-4.66.3-py3-none-any.whl", hash = "sha256:4f41d54107ff9a223dca80b53efe4fb654c67efaba7f47bada3ee9d50e05bd53"}, + {file = "tqdm-4.66.3.tar.gz", hash = "sha256:23097a41eba115ba99ecae40d06444c15d1c0c698d527a01c6c8bd1c5d0647e5"}, ] [package.dependencies] @@ -1165,13 +1165,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.1" +version = "2.2.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, - {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, ] [package.extras] @@ -1194,4 +1194,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "c8c4cc87637266a7b85debcbafa8887c5ad81cc8ef40e98a3f52c7c50af05c03" +content-hash = "0bcd190f2fc81c926bb25e891c22ce86b513cc8aa6e40b3d73b5b18104caab88" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 46a6ad56202f7..64b8fb81d0c5d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,3 +1,14 @@ +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) # TODO: should not use this if (WIN32) if (BUILD_SHARED_LIBS) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 08ad66b49fdd4..c06037b4ee2c1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,14 @@ +#enable O3 flag +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_FLAGS "-Wall -Wextra -Wno-unused-variable -Wno-unused-function -Wno-error") +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") + +#add lto +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) function(llama_test target) include(CMakeParseArguments) set(options) From 248cb2fc97d5d1792b1c66e12805bada19c892b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:35:15 +0000 Subject: [PATCH 2/3] Bump requests from 2.31.0 to 2.32.2 in the pip group across 1 directory Bumps the pip group with 1 update in the / directory: [requests](https://github.com/psf/requests). Updates `requests` from 2.31.0 to 2.32.2 - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.31.0...v2.32.2) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production dependency-group: pip ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9acb8b7bae5b2..a18359b8fb223 100644 --- a/poetry.lock +++ b/poetry.lock @@ -682,13 +682,13 @@ files = [ [[package]] name = "requests" -version = "2.31.0" +version = "2.32.2" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, + {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, ] [package.dependencies] From c9f3add64959dc71146042f726b4101f75d9cab0 Mon Sep 17 00:00:00 2001 From: apicalshark <58538165+apicalshark@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:50:58 +0800 Subject: [PATCH 3/3] Temp (#15) * metal : fix minor string leaks (ggml/1004) * cmake : make it possible linking ggml as external lib (ggml/1003) * sync : ggml * CANN: adjust backend registry refactor. (#10158) remove buffer->iface.get_name that used in cann as it was removed in backend registry refactor PR. * metal : move dequantize templates to beginning of MSL source (#0) * metal : simplify f16 and f32 dequant kernels (#0) * cuda : clear error after changing peer access (#10153) * fix build break on arm64 linux (#10166) This fixes the build break from the recent changes to move the CPU backend to separate files https://github.com/ggerganov/llama.cpp/pull/10144 * server : clarify /slots endpoint, add is_processing (#10162) * server : clarify /slots endpoint, add is_processing * fix tests * ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (#10167) * ggml : fix gelu tables initialization (#10172) * Q6_K AVX improvements (#10118) * q6_k instruction reordering attempt * better subtract method * should be theoretically faster small improvement with shuffle lut, likely because all loads are already done at that stage * optimize bit fiddling * handle -32 offset separately. bsums exists for a reason! * use shift * Update ggml-quants.c * have to update ci macos version to 13 as 12 doesnt work now. 13 is still x86 * ggml : fix arch check in bf16_to_fp32 (#10164) * llama : add <|tool_call|> formatting to Granite template (#10177) Branch: GraniteToolCallTemplate Signed-off-by: Gabe Goodhart * metal : add quantized FA support (#10149) * metal : add quantized FA (vec) support ggml-ci * metal : add quantized FA (non-vec) support * metal : fix support check ggml-ci * metal : clean-up * metal : clean-up (cont) * metal : fix shared memory calc + reduce smem + comments * metal : float-correctness * metal : minor [no ci] * ggml : adjust is_first_call init value (#10193) ggml-ci * metal : fix from ptr buffer name (#10189) * server : remove hack for extra parallel slot (#10187) ggml-ci * metal : add BF16 support (#8439) * ggml : add initial BF16 support ggml-ci * metal : add mul_mat_id BF16 support ggml-ci * metal : check for bfloat support on the Metal device ggml-ci * metal : better var names [no ci] * metal : do not build bfloat kernels when not supported ggml-ci * metal : try to fix BF16 support check ggml-ci * metal : this should correctly check bfloat support --------- Signed-off-by: Gabe Goodhart Co-authored-by: Plamen Minev Co-authored-by: Yuri Khrustalev Co-authored-by: Georgi Gerganov Co-authored-by: leo-pony Co-authored-by: Diego Devesa Co-authored-by: snadampal <87143774+snadampal@users.noreply.github.com> Co-authored-by: Xuan Son Nguyen Co-authored-by: Eve <139727413+netrunnereve@users.noreply.github.com> Co-authored-by: Gabe Goodhart --- .github/workflows/build.yml | 1 + common/common.cpp | 3 + examples/server/README.md | 11 +- examples/server/server.cpp | 69 +- examples/server/tests/features/steps/steps.py | 10 +- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml-cann.cpp | 1 - ggml/src/ggml-cpu.c | 19 +- ggml/src/ggml-cuda.cu | 6 + ggml/src/ggml-metal.m | 668 +++-- ggml/src/ggml-metal.metal | 2334 +++++++++-------- ggml/src/ggml-quants.c | 89 +- ggml/src/ggml.c | 18 +- scripts/sync-ggml.last | 2 +- src/llama.cpp | 9 +- tests/test-backend-ops.cpp | 2 +- 16 files changed, 1877 insertions(+), 1367 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 40ec8e738ff34..cea7de35aac03 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,6 +63,7 @@ env: jobs: + # CUDA Release ubuntu-latest-cmake: diff --git a/common/common.cpp b/common/common.cpp index c8cbaae11cf1c..19674af15fa7e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { if (s == "f16") { return GGML_TYPE_F16; } + if (s == "bf16") { + return GGML_TYPE_BF16; + } if (s == "q8_0") { return GGML_TYPE_Q8_0; } diff --git a/examples/server/README.md b/examples/server/README.md index 1629e456b6836..15f95db1e06e1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte ### GET `/slots`: Returns the current slots processing state -This endpoint can be disabled with `--no-slots` +> [!WARNING] +> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments. + +This endpoint is disabled by default and can be enabled with `--slots` If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots. @@ -709,6 +712,7 @@ Example: "grammar": "", "id": 0, "ignore_eos": false, + "is_processing": false, "logit_bias": [], "min_p": 0.05000000074505806, "mirostat": 0, @@ -741,7 +745,6 @@ Example: "temperature" ], "seed": 42, - "state": 1, "stop": [ "\n" ], @@ -755,10 +758,6 @@ Example: ] ``` -Possible values for `slot[i].state` are: -- `0`: SLOT_STATE_IDLE -- `1`: SLOT_STATE_PROCESSING - ### GET `/metrics`: Prometheus compatible metrics exporter This endpoint is only accessible if `--metrics` is set. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8531a784ded3d..1c7f0fd1dd1c3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -378,8 +378,8 @@ struct server_queue { std::condition_variable condition_tasks; // callback functions - std::function callback_new_task; - std::function callback_update_slots; + std::function callback_new_task; + std::function callback_update_slots; // Add a new task to the end of the queue int post(server_task task, bool front = false) { @@ -431,7 +431,7 @@ struct server_queue { } // Register function to process a new task - void on_new_task(std::function callback) { + void on_new_task(std::function callback) { callback_new_task = std::move(callback); } @@ -481,7 +481,7 @@ struct server_queue { lock.unlock(); QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(task); + callback_new_task(std::move(task)); } // all tasks in the current loop is processed, slots data is now ready @@ -644,17 +644,12 @@ struct server_context { bool load_model(const common_params & params_) { params = params_; - // reserve one extra sequence (seq_id == 0) for extra features - params.n_parallel += 1; - common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; loras = llama_init.lora_adapters; - params.n_parallel -= 1; // but be sneaky about it - if (model == nullptr) { SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; @@ -1288,16 +1283,16 @@ struct server_context { void send_embedding(const server_slot & slot, const llama_batch & batch) { server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + res.id = slot.id_task; + res.error = false; + res.stop = true; const int n_embd = llama_n_embd(model); std::vector embd_res(n_embd, 0.0f); for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { continue; } @@ -1332,12 +1327,12 @@ struct server_context { void send_rerank(const server_slot & slot, const llama_batch & batch) { server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + res.id = slot.id_task; + res.error = false; + res.stop = true; for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { continue; } @@ -1510,7 +1505,7 @@ struct server_context { // Functions to process the task // - void process_single_task(const server_task & task) { + void process_single_task(server_task task) { switch (task.type) { case SERVER_TASK_TYPE_INFERENCE: { @@ -1566,11 +1561,11 @@ struct server_context { for (server_slot & slot : slots) { json slot_data = get_formated_generation(slot); - slot_data["id"] = slot.id; - slot_data["id_task"] = slot.id_task; - slot_data["state"] = slot.state; - slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); - slot_data["next_token"] = { + slot_data["id"] = slot.id; + slot_data["id_task"] = slot.id_task; + slot_data["is_processing"] = slot.is_processing(); + slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); + slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, @@ -1581,10 +1576,10 @@ struct server_context { {"stopping_word", slot.stopping_word}, }; - if (slot_data["state"] == SLOT_STATE_IDLE) { - n_idle_slots++; - } else { + if (slot.is_processing()) { n_processing_slots++; + } else { + n_idle_slots++; } slots_data.push_back(slot_data); @@ -1646,7 +1641,7 @@ struct server_context { std::string filename = task.data.at("filename"); std::string filepath = task.data.at("filepath"); - const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count); + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; @@ -1688,7 +1683,7 @@ struct server_context { slot->cache_tokens.resize(slot->n_ctx); size_t token_count = 0; - size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); + size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); if (nread == 0) { slot->cache_tokens.resize(0); send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST); @@ -1731,7 +1726,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); server_task_result result; @@ -1808,8 +1803,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1836,7 +1831,7 @@ struct server_context { slot.i_batch = batch.n_tokens; - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -1983,8 +1978,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2033,9 +2028,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -2048,7 +2043,7 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 2e418d8aa571b..687b163f487b6 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str): match expected_slot_status_string: case 'idle': - expected_slot_status = 0 + expected_slot_status = False case 'busy': - expected_slot_status = 1 + expected_slot_status = True case _: assert False, "unknown status" - expected_slots = [{'id': slot_id, 'state': expected_slot_status} + expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status} for slot_id in range(context.n_slots)] await request_slots_status(context, expected_slots) @@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context, if status_code == 503 and status_code == expected_http_status_code: return if status_code == 200 and status_code == expected_http_status_code: - n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots) - n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots) + n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots) + n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots) if ((slots_idle is None or slots_idle == n_slots_idle) and (slots_processing is None or slots_processing == n_slots_processing)): return diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6a729f7dd815a..23e14f2834334 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1406,7 +1406,7 @@ if (EMSCRIPTEN) endif() target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) -target_include_directories(ggml PUBLIC ../include) +target_include_directories(ggml PUBLIC $ $) target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index f8ac11e41ce21..776340881434d 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_cann_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; return buffer; diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c index 4b8ffb629afbb..0cb5b824afc05 100644 --- a/ggml/src/ggml-cpu.c +++ b/ggml/src/ggml-cpu.c @@ -304,6 +304,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_Q8_0] = { + .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) @@ -13677,6 +13678,13 @@ int ggml_cpu_get_sve_cnt(void) { } void ggml_cpu_init(void) { + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + ggml_critical_section_start(); static bool is_first_call = true; @@ -13684,24 +13692,21 @@ void ggml_cpu_init(void) { if (is_first_call) { // initialize GELU, Quick GELU, SILU and EXP F32 tables { - // FIXME: this may be called before ggml_init - //const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + const uint64_t t_start = ggml_time_us(); UNUSED(t_start); for (int i = 0; i < (1 << 16); ++i) { union { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - // FIXME: this table is used in conversion functions outside of compute - // current code depends on ggml_init initializing this table - float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + float f = GGML_FP16_TO_FP32(u.fp16); ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } - //const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); + GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); } #if defined(__ARM_ARCH) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index b57f1b3b79469..e68e405501bed 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); if (err != cudaErrorPeerAccessAlreadyEnabled) { CUDA_CHECK(err); + } else { + // reset the error + cudaGetLastError(); } } else { cudaError_t err = cudaDeviceDisablePeerAccess(id_other); if (err != cudaErrorPeerAccessNotEnabled) { CUDA_CHECK(err); + } else { + // reset the error + cudaGetLastError(); } } } diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index a2b4d49d583aa..f13adee384350 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -36,16 +36,18 @@ id mtl_device; int mtl_device_ref_count; - bool support_simdgroup_reduction; - bool support_simdgroup_mm; + bool has_simdgroup_reduction; + bool has_simdgroup_mm; + bool has_bfloat; char name[128]; } g_ggml_ctx_dev_main = { - /*.mtl_device =*/ nil, - /*.mtl_device_ref_count =*/ 0, - /*.support_simdgroup_reduction =*/ false, - /*.support_simdgroup_mm =*/ false, - /*.name =*/ "", + /*.mtl_device =*/ nil, + /*.mtl_device_ref_count =*/ 0, + /*.has_simdgroup_reduction =*/ false, + /*.has_simdgroup_mm =*/ false, + /*.has_bfloat =*/ false, + /*.name =*/ "", }; // acquire @@ -55,10 +57,13 @@ if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); - ctx->support_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; - ctx->support_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; - ctx->support_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + + ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -120,6 +125,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, + GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, @@ -146,10 +152,14 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, @@ -170,10 +180,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, - //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, + //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, @@ -195,6 +206,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, @@ -216,6 +228,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, @@ -255,13 +268,56 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, - //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, - //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, + GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, GGML_METAL_KERNEL_TYPE_CPY_F16_F16, GGML_METAL_KERNEL_TYPE_CPY_F16_F32, + GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, + GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, @@ -440,7 +496,7 @@ @implementation GGMLMetalClass // dictionary of preprocessor macros NSMutableDictionary * prep = [NSMutableDictionary dictionary]; - MTLCompileOptions* options = [MTLCompileOptions new]; + MTLCompileOptions * options = [MTLCompileOptions new]; options.preprocessorMacros = prep; //[options setFastMathEnabled:false]; @@ -450,7 +506,14 @@ @implementation GGMLMetalClass GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } + +#if !__has_feature(objc_arc) + [options release]; +#endif } +#if GGML_METAL_EMBED_LIBRARY + [src release]; +#endif // GGML_METAL_EMBED_LIBRARY } } @@ -483,9 +546,10 @@ @implementation GGMLMetalClass } } - GGML_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx_dev->support_simdgroup_reduction ? "true" : "false"); - GGML_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx_dev->support_simdgroup_mm ? "true" : "false"); - GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false"); + GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false"); + GGML_LOG_INFO("%s: bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false"); ctx->capture_next_compute = false; ctx->capture_started = false; @@ -531,8 +595,9 @@ @implementation GGMLMetalClass GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } - const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; - const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; + const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; + const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; + const bool has_bfloat = ctx_dev->has_bfloat; // simd_sum and simd_max requires MTLGPUFamilyApple7 @@ -560,14 +625,15 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16, get_rows_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); @@ -588,101 +654,108 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, has_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32, mul_mv_id_bf16_f32, has_simdgroup_reduction && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, mul_mm_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); @@ -698,18 +771,61 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, support_simdgroup_mm); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64, flash_attn_ext_q4_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80, flash_attn_ext_q4_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96, flash_attn_ext_q4_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112, flash_attn_ext_q4_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128, flash_attn_ext_q4_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256, flash_attn_ext_q4_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64, flash_attn_ext_q4_1_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80, flash_attn_ext_q4_1_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96, flash_attn_ext_q4_1_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112, flash_attn_ext_q4_1_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128, flash_attn_ext_q4_1_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256, flash_attn_ext_q4_1_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64, flash_attn_ext_q5_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80, flash_attn_ext_q5_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96, flash_attn_ext_q5_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112, flash_attn_ext_q5_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128, flash_attn_ext_q5_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256, flash_attn_ext_q5_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64, flash_attn_ext_q5_1_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80, flash_attn_ext_q5_1_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96, flash_attn_ext_q5_1_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112, flash_attn_ext_q5_1_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128, flash_attn_ext_q5_1_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256, flash_attn_ext_q5_1_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64, flash_attn_ext_q8_0_h64, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80, flash_attn_ext_q8_0_h80, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96, flash_attn_ext_q8_0_h96, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112, flash_attn_ext_q8_0_h112, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128, flash_attn_ext_q8_0_h128, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128, flash_attn_ext_vec_q4_1_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128, flash_attn_ext_vec_q5_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128, flash_attn_ext_vec_q5_1_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128, flash_attn_ext_vec_q8_0_h128, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256, flash_attn_ext_vec_q4_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256, flash_attn_ext_vec_q4_1_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32, cpy_bf16_f32, has_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16, cpy_bf16_bf16, has_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); @@ -799,15 +915,18 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { } static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) { - for (size_t i = 0, n = 3; i < n; ++i) { - if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { - return false; + const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; + const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction; + const bool has_bfloat = ctx_dev->has_bfloat; + + if (!has_bfloat) { + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { + return false; + } } } - const bool support_simdgroup_mm = ctx_dev->support_simdgroup_mm; - const bool support_simdgroup_reduction = ctx_dev->support_simdgroup_reduction; - switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -845,7 +964,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_SOFT_MAX: case GGML_OP_RMS_NORM: case GGML_OP_GROUP_NORM: - return support_simdgroup_reduction; + return has_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ROPE: return true; @@ -862,22 +981,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_LEAKY_RELU: return true; case GGML_OP_FLASH_ATTN_EXT: - if (op->src[1]->type != GGML_TYPE_F16) { - return false; - } - if (op->src[2]->type != GGML_TYPE_F16) { + if (op->src[1]->type != op->src[2]->type) { return false; } - if (op->src[0]->ne[0] == 256) { - return false; - } - return support_simdgroup_mm; // TODO: over-restricted for vec-kernels + return has_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return support_simdgroup_reduction && + return has_simdgroup_reduction && (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32); case GGML_OP_CPY: case GGML_OP_DUP: @@ -888,6 +1001,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex switch (op->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -900,10 +1014,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } case GGML_TYPE_F16: switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_F16: return true; - default: + default: + return false; + } + case GGML_TYPE_BF16: + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_BF16: + return true; + default: return false; } default: @@ -1774,6 +1896,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; default: break; } @@ -1782,6 +1905,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; @@ -1859,6 +1983,25 @@ static void ggml_metal_encode_node( nrows = 4; } } break; + case GGML_TYPE_BF16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; + nrows = 4; + } + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; + nrows = 4; + } + } break; case GGML_TYPE_Q4_0: { nth0 = 8; @@ -2077,12 +2220,12 @@ static void ggml_metal_encode_node( if ([device supportsFamily:MTLGPUFamilyApple7] && ne00 % 32 == 0 && ne00 >= 64 && dst_rows > dst_rows_min) { - // some Metal matrix data types require aligned pointers // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; default: break; } @@ -2091,6 +2234,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; @@ -2160,6 +2304,13 @@ static void ggml_metal_encode_node( nth1 = 1; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; } break; + case GGML_TYPE_BF16: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32].pipeline; + } break; case GGML_TYPE_Q4_0: { nth0 = 8; @@ -2357,6 +2508,7 @@ static void ggml_metal_encode_node( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; @@ -2815,6 +2967,7 @@ static void ggml_metal_encode_node( GGML_ASSERT(ne11 % 32 == 0); GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == src2->type); GGML_ASSERT(ggml_are_same_shape (src1, src2)); @@ -2862,26 +3015,154 @@ static void ggml_metal_encode_node( bool use_vec_kernel = false; if (ne01 >= 4 || (ne00%128 != 0)) { - switch (ne00) { - case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; - case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; - case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; - case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + switch (src1->type) { + case GGML_TYPE_F16: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q4_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q4_1: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q5_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q5_1: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; + case GGML_TYPE_Q8_0: + { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_LOG_ERROR("add template specialization for this size\n"); + GGML_ABORT("add template specialization for this size"); + } + } + } break; default: - { - GGML_LOG_ERROR("unsupported size: %lld\n", ne00); - GGML_LOG_ERROR("add template specialization for this size\n"); - GGML_ABORT("add template specialization for this size"); - } + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } } } else { use_vec_kernel = true; switch (ne00) { - case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + case 128: + { + switch (src1->type) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } + } + } break; + case 256: + { + switch (src1->type) { + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256].pipeline; break; + default: + { + GGML_LOG_ERROR("unsupported type: %d\n", src1->type); + GGML_LOG_ERROR("add template specialization for this type\n"); + GGML_ABORT("add template specialization for this type"); + } + } + } break; default: { GGML_LOG_ERROR("unsupported size: %lld\n", ne00); @@ -2935,10 +3216,16 @@ static void ggml_metal_encode_node( GGML_ASSERT(nqptg % 8 == 0); GGML_ASSERT(ncpsg % 32 == 0); + // 16*32*(nsg) + // the shared memory needed for the simdgroups to load the KV cache + // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG + // +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16)) + int64_t nsgmax = 2; while (true) { - const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsgmax); if (smem > device.maxThreadgroupMemoryLength) { break; } @@ -2949,16 +3236,15 @@ static void ggml_metal_encode_node( // simdgroups per threadgroup (a.k.a. warps) const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; - const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsg); - //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg); GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); - - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - + [encoder setThreadgroupMemoryLength:smem atIndex:0]; +#undef FATTN_SMEM [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; } else { - // half1x4 kernel + // half4x4 kernel const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! @@ -2966,8 +3252,28 @@ static void ggml_metal_encode_node( GGML_ASSERT(nqptg % 1 == 0); GGML_ASSERT(ncpsg % 32 == 0); + // ne00 + 2*ncpsg*(nsg) + // for each query, we load it as f16 in shared memory (ne00) + // and store the attention scores (nqptg x ncpsg) as f32 + // + // 2*ne00*(nsg) + // each simdgroup has a full f32 head vector in shared mem to accumulate results + // +#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + 2*ne00*(nsg))*(sizeof(float)/2), 16)) + + int64_t nsgmax = 2; + + while (true) { + const size_t smem = FATTN_SMEM(nsgmax); + if (smem > device.maxThreadgroupMemoryLength) { + break; + } + nsgmax *= 2; + } + nsgmax /= 2; + // simdgroups per threadgroup (a.k.a. warps) - const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); + const int64_t nsgt = MAX(2, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))); int64_t nsg = 1; while (nsg <= nsgt) { @@ -2975,12 +3281,12 @@ static void ggml_metal_encode_node( } nsg /= 2; - const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); + const size_t smem = FATTN_SMEM(nsg); - //printf("smem: %zu, max: %zu\n", smem, device.maxThreadgroupMemoryLength); + //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg); GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); - [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; - + [encoder setThreadgroupMemoryLength:smem atIndex:0]; +#undef FATTN_SMEM [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; } } break; @@ -3002,6 +3308,7 @@ static void ggml_metal_encode_node( switch (dstt) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_BF16].pipeline; break; case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; @@ -3019,6 +3326,14 @@ static void ggml_metal_encode_node( default: GGML_ABORT("not implemented"); }; } break; + case GGML_TYPE_BF16: + { + switch (dstt) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; default: GGML_ABORT("not implemented"); } @@ -3837,7 +4152,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back } } - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { @@ -3847,7 +4162,8 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; UNUSED(dev); } diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 57eb34f13ac85..16b5da3ff3f02 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -12,435 +12,469 @@ using namespace metal; #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 -enum ggml_sort_order { - GGML_SORT_ORDER_ASC, - GGML_SORT_ORDER_DESC, -}; +// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf +// +// cmd: +// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal.metal +// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal.metal +// +#if __METAL_VERSION__ < 310 +#define GGML_METAL_NO_BFLOAT +#endif -// general-purpose kernel for addition, subtraction, multiplication and division of two tensors -// pros: works for non-contiguous tensors, supports broadcast across all dims -// cons: not very efficient -kernel void kernel_add( - device const char * src0, - device const char * src1, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int64_t & offs, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig.z; - const int64_t i02 = tgpig.y; - const int64_t i01 = tgpig.x; +#if !defined(GGML_METAL_NO_BFLOAT) +typedef matrix bfloat4x4; +#endif - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; +constexpr constant static float kvalues_iq4nl_f[16] = { + -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f +}; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs; +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - const int i10 = i0 % ne10; - *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10)); - } +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); } -kernel void kernel_sub( - device const char * src0, - device const char * src1, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int64_t & offs, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig.z; - const int64_t i02 = tgpig.y; - const int64_t i01 = tgpig.x; +#if !defined(GGML_METAL_NO_BFLOAT) +template +void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { + reg = (type4x4)(*src); +} +#endif - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs; + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - const int i10 = i0 % ne10; - *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10)); +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; } } -kernel void kernel_mul( - device const char * src0, - device const char * src1, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig.z; - const int64_t i02 = tgpig.y; - const int64_t i01 = tgpig.x; +template +void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; + const uint32_t qh = *((device const uint32_t *)xb->qh); - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + const int x_mv = il ? 4 : 0; - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - const int i10 = i0 % ne10; - *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10)); + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + md; + reg[i/2][2*(i%2)+1] = d * x1 + md; } } -kernel void kernel_div( - device const char * src0, - device const char * src1, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig.z; - const int64_t i02 = tgpig.y; - const int64_t i01 = tgpig.x; +template +void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; + const uint32_t qh = *((device const uint32_t *)xb->qh); - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; - device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; - device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + const int x_mv = il ? 4 : 0; - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - const int i10 = i0 % ne10; - *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10)); - } -} + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; -template -kernel void kernel_repeat( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3 % ne03; - const int64_t i02 = i2 % ne02; - const int64_t i01 = i1 % ne01; + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; - device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ; + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - const int i00 = i0 % ne00; - *((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00)); + reg[i/2][2*(i%2)+0] = d * x0 + m; + reg[i/2][2*(i%2)+1] = d * x1 + m; } } -typedef decltype(kernel_repeat) kernel_repeat_t; - -template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; -template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; -template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; -template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; - -// assumption: src1 is a row -// broadcast src1 into src0 -kernel void kernel_add_row( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - constant uint64_t & nb [[buffer(28)]], - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig % nb]; -} +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; -kernel void kernel_sub_row( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - constant uint64_t & nb [[buffer(28)]], - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] - src1[tpig % nb]; + for (int i = 0; i < 16; i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } } -kernel void kernel_mul_row( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - constant uint64_t & nb [[buffer(28)]], - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % nb]; -} +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const float d = xb->d; + const float min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + float dl, ml; + uint8_t sc = xb->scales[il]; -kernel void kernel_div_row( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - constant uint64_t & nb [[buffer(28)]], - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] / src1[tpig % nb]; -} + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; -kernel void kernel_scale( - device const float * src0, - device float * dst, - constant float & scale, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * scale; + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } } -kernel void kernel_scale_4( - device const float4 * src0, - device float4 * dst, - constant float & scale, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * scale; -} +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; -kernel void kernel_clamp( - device const float * src0, - device float * dst, - constant float & min, - constant float & max, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]); -} + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + const float ml = 4.f * dl; -kernel void kernel_relu( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = max(0.0f, src0[tpig]); -} + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; -kernel void kernel_sigmoid( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig])); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); + } } -kernel void kernel_tanh( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - device const float & x = src0[tpig]; - dst[tpig] = precise::tanh(x); +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; } -constant float GELU_COEF_A = 0.044715f; -constant float GELU_QUICK_COEF = -1.702f; -constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +template +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uchar * q = xb->qs; -kernel void kernel_gelu( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - device const float & x = src0[tpig]; + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.h; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; - dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } } -kernel void kernel_gelu_4( - device const float4 * src0, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - device const float4 & x = src0[tpig]; - - // BEWARE !!! - // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs! - // This was observed with Falcon 7B and 40B models - // - dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); -} +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; -kernel void kernel_gelu_quick( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - device const float & x = src0[tpig]; + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const float d = il < 2 ? xb->d : xb->d / 16.f; + const float min = xb->dmin; + const float dl = d * sc[0]; + const float ml = min * sc[1]; - dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); + const ushort mask = il<2 ? 0x0F : 0xF0; + const float qh_val = il<2 ? 16.f : 256.f; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } } -kernel void kernel_gelu_quick_4( - device const float4 * src0, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - device const float4 & x = src0[tpig]; - - dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); -} +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; -kernel void kernel_silu( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - device const float & x = src0[tpig]; - dst[tpig] = x / (1.0f + exp(-x)); -} + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + float sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; -kernel void kernel_silu_4( - device const float4 * src0, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - device const float4 & x = src0[tpig]; - dst[tpig] = x / (1.0f + exp(-x)); + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const float coef = il>1 ? 1.f/16.f : 1.f; + const float ml = d_all * sc * 32.f; + const float dl = d_all * sc * coef; + for (int i = 0; i < 16; ++i) { + const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) + : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } } -kernel void kernel_sqr( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src0[tpig]; +template +void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. + device const uint16_t * q2 = xb->qs + 4*ib32; + const uint32_t aux32_g = q2[0] | (q2[1] << 16); + const uint32_t aux32_s = q2[2] | (q2[3] << 16); + thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; + const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } } -kernel void kernel_sqrt( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = sqrt(src0[tpig]); +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } } -kernel void kernel_sin( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = sin(src0[tpig]); +template +void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * q3 = xb->qs + 8*ib32; + device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); + uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } + grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); + grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); + signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } } -kernel void kernel_cos( - device const float * src0, - device float * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = cos(src0[tpig]); +template +void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 8*ib32; + device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); + constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); + reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); + } + grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); + reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); + } } -kernel void kernel_sum_rows( - device const float * src0, - device float * dst, +template +void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * signs = qs + QK_K/8; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); + constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); + for (int i = 0; i < 8; ++i) { + reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); + reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); + } +} + +template +void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + const float d = xb->d; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint16_t * qh = xb->qh; + const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); + const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); + const uint16_t h = qh[ib32] >> 6*il; + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml; + reg[1][i] = dl * (grid1[i] >> 4) + ml; + reg[2][i] = dl * (grid2[i] & 0xf) + ml; + reg[3][i] = dl * (grid2[i] >> 4) + ml; + } +} + +template +void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + device const uint16_t * sc = (device const uint16_t *)xb->scales; + + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = scale.f16; + + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * qh = xb->qh + 2*ib32 + il; + + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); + const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml1; + reg[1][i] = dl * (grid1[i] >> 4) + ml1; + reg[2][i] = dl * (grid2[i] & 0xf) + ml2; + reg[3][i] = dl * (grid2[i] >> 4) + ml2; + } +} + +template +void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { + device const uint16_t * q4 = (device const uint16_t *)xb->qs; + const float d = xb->d; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + +template +void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; + const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)xb->d * (ls - 32); + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + for (int i = 0; i < 4; ++i) { + aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; + reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; + reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; + reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; + reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; + } +} + +enum ggml_sort_order { + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, +}; + +// general-purpose kernel for addition, subtraction, multiplication and division of two tensors +// pros: works for non-contiguous tensors, supports broadcast across all dims +// cons: not very efficient +kernel void kernel_add( + device const char * src0, + device const char * src1, + device char * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -465,132 +499,446 @@ kernel void kernel_sum_rows( constant uint64_t & nb1, constant uint64_t & nb2, constant uint64_t & nb3, - uint3 tpig[[thread_position_in_grid]]) { - int64_t i3 = tpig.z; - int64_t i2 = tpig.y; - int64_t i1 = tpig.x; + constant int64_t & offs, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; - if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { - return; - } - - device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); - device float * dst_row = (device float *) ((device char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; - float row_sum = 0; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs; - for (int64_t i0 = 0; i0 < ne00; i0++) { - row_sum += src_row[i0]; + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10)); } - - dst_row[0] = row_sum; } -template -kernel void kernel_soft_max( - device const char * src0, - device const char * src1, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant float & scale, - constant float & max_bias, - constant float & m0, - constant float & m1, - constant uint32_t & n_head_log2, - threadgroup float * buf [[threadgroup(0)]], - uint tgpig[[threadgroup_position_in_grid]], - uint tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint ntg[[threads_per_threadgroup]]) { - const int64_t i03 = (tgpig) / (ne02*ne01); - const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; - const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); +kernel void kernel_sub( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int64_t & offs, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; - device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr; - device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; - float slope = 1.0f; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs; - // ALiBi - if (max_bias > 0.0f) { - const int64_t h = i02; + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10)); + } +} - const float base = h < n_head_log2 ? m0 : m1; - const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; +kernel void kernel_mul( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; - slope = pow(base, exp); - } + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; - // parallel max - float lmax = -INFINITY; + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)); + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10)); } +} - // find the max value in the block - float max_val = simd_max(lmax); - if (ntg > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = -INFINITY; - } - - threadgroup_barrier(mem_flags::mem_threadgroup); +kernel void kernel_div( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; - if (tiisg == 0) { - buf[sgitg] = max_val; - } + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; - threadgroup_barrier(mem_flags::mem_threadgroup); + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; - max_val = buf[tiisg]; - max_val = simd_max(max_val); + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i10 = i0 % ne10; + *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10)); } +} - // parallel sum - float lsum = 0.0f; - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); - lsum += exp_psrc0; - pdst[i00] = exp_psrc0; - } +template +kernel void kernel_repeat( + device const char * src0, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; - // This barrier fixes a failing test - // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 - threadgroup_barrier(mem_flags::mem_none); + const int64_t i03 = i3 % ne03; + const int64_t i02 = i2 % ne02; + const int64_t i01 = i1 % ne01; - float sum = simd_sum(lsum); + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ; - if (ntg > N_SIMDWIDTH) { - if (sgitg == 0) { - buf[tiisg] = 0.0f; - } + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i00 = i0 % ne00; + *((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00)); + } +} - threadgroup_barrier(mem_flags::mem_threadgroup); +typedef decltype(kernel_repeat) kernel_repeat_t; - if (tiisg == 0) { - buf[sgitg] = sum; - } +template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; - threadgroup_barrier(mem_flags::mem_threadgroup); +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_add_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant uint64_t & nb [[buffer(28)]], + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig % nb]; +} - sum = buf[tiisg]; - sum = simd_sum(sum); +kernel void kernel_sub_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant uint64_t & nb [[buffer(28)]], + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] - src1[tpig % nb]; +} + +kernel void kernel_mul_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant uint64_t & nb [[buffer(28)]], + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src1[tpig % nb]; +} + +kernel void kernel_div_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant uint64_t & nb [[buffer(28)]], + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] / src1[tpig % nb]; +} + +kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_scale_4( + device const float4 * src0, + device float4 * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_clamp( + device const float * src0, + device float * dst, + constant float & min, + constant float & max, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]); +} + +kernel void kernel_relu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = max(0.0f, src0[tpig]); +} + +kernel void kernel_sigmoid( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig])); +} + +kernel void kernel_tanh( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = precise::tanh(x); +} + +constant float GELU_COEF_A = 0.044715f; +constant float GELU_QUICK_COEF = -1.702f; +constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +kernel void kernel_gelu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +kernel void kernel_gelu_4( + device const float4 * src0, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + device const float4 & x = src0[tpig]; + + // BEWARE !!! + // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs! + // This was observed with Falcon 7B and 40B models + // + dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +kernel void kernel_gelu_quick( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); +} + +kernel void kernel_gelu_quick_4( + device const float4 * src0, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + device const float4 & x = src0[tpig]; + + dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); +} + +kernel void kernel_silu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = x / (1.0f + exp(-x)); +} + +kernel void kernel_silu_4( + device const float4 * src0, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + device const float4 & x = src0[tpig]; + dst[tpig] = x / (1.0f + exp(-x)); +} + +kernel void kernel_sqr( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src0[tpig]; +} + +kernel void kernel_sqrt( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = sqrt(src0[tpig]); +} + +kernel void kernel_sin( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = sin(src0[tpig]); +} + +kernel void kernel_cos( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = cos(src0[tpig]); +} + +kernel void kernel_sum_rows( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tpig[[thread_position_in_grid]]) { + int64_t i3 = tpig.z; + int64_t i2 = tpig.y; + int64_t i1 = tpig.x; + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; } - const float inv_sum = 1.0f/sum; + device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + device float * dst_row = (device float *) ((device char *) dst + i1*nb1 + i2*nb2 + i3*nb3); - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - pdst[i00] *= inv_sum; + float row_sum = 0; + + for (int64_t i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; } + + dst_row[0] = row_sum; } template -kernel void kernel_soft_max_4( +kernel void kernel_soft_max( device const char * src0, device const char * src1, device char * dst, @@ -612,12 +960,13 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; - device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; - device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; + device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr; + device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); float slope = 1.0f; + // ALiBi if (max_bias > 0.0f) { const int64_t h = i02; @@ -628,14 +977,13 @@ kernel void kernel_soft_max_4( } // parallel max - float4 lmax4 = -INFINITY; + float lmax = -INFINITY; - for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)); } - const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - + // find the max value in the block float max_val = simd_max(lmax); if (ntg > N_SIMDWIDTH) { if (sgitg == 0) { @@ -655,15 +1003,118 @@ kernel void kernel_soft_max_4( } // parallel sum - float4 lsum4 = 0.0f; - for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val); - lsum4 += exp_psrc4; - pdst4[i00] = exp_psrc4; + float lsum = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); + lsum += exp_psrc0; + pdst[i00] = exp_psrc0; } - const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - + // This barrier fixes a failing test + // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 + threadgroup_barrier(mem_flags::mem_none); + + float sum = simd_sum(lsum); + + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = 0.0f; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[tiisg]; + sum = simd_sum(sum); + } + + const float inv_sum = 1.0f/sum; + + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + pdst[i00] *= inv_sum; + } +} + +template +kernel void kernel_soft_max_4( + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant float & scale, + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); + + device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; + device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; + device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; + + float slope = 1.0f; + + if (max_bias > 0.0f) { + const int64_t h = i02; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + + // parallel max + float4 lmax4 = -INFINITY; + + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); + } + + const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + + float max_val = simd_max(lmax); + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + buf[tiisg] = -INFINITY; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + buf[sgitg] = max_val; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = buf[tiisg]; + max_val = simd_max(max_val); + } + + // parallel sum + float4 lsum4 = 0.0f; + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val); + lsum4 += exp_psrc4; + pdst4[i00] = exp_psrc4; + } + + const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + // This barrier fixes a failing test // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 threadgroup_barrier(mem_flags::mem_none); @@ -1611,6 +2062,10 @@ typedef decltype(kernel_mul_mv) mul_mv_t; template [[host_name("kernel_mul_mv_f32_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f32")]] kernel mul_mv_t kernel_mul_mv; template [[host_name("kernel_mul_mv_f16_f16")]] kernel mul_mv_t kernel_mul_mv; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv; +template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv; +#endif template kernel void kernel_mul_mv_1row( @@ -1680,6 +2135,9 @@ kernel void kernel_mul_mv_1row( typedef decltype(kernel_mul_mv_1row) mul_mv_1row_t; template [[host_name("kernel_mul_mv_f16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row; +#endif // Assumes row size (ne00) is a multiple of 4 template @@ -1739,6 +2197,9 @@ kernel void kernel_mul_mv_l4( typedef decltype(kernel_mul_mv_l4) mul_mv_l4_t; template [[host_name("kernel_mul_mv_f16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4; +#endif static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / max(0.001f, high - low); @@ -2293,46 +2754,10 @@ kernel void kernel_leaky_relu_f32( dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope; } -typedef void (flash_attn_ext_f16_t)( - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device float * dst, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant uint64_t & nb13, - constant uint64_t & nb21, - constant uint64_t & nb22, - constant uint64_t & nb23, - constant uint64_t & nb31, - constant int64_t & ne1, - constant int64_t & ne2, - constant float & scale, - constant float & max_bias, - constant float & m0, - constant float & m1, - constant uint32_t & n_head_log2, - constant float & logit_softcap, - threadgroup half * shared, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]); - // ref: https://arxiv.org/pdf/2307.08691.pdf -template // head size, queries per threadgroup, cache items per threadgroup -kernel void kernel_flash_attn_ext_f16( +// D - head size, Q - queries per threadgroup, KV - key/value processed per each simdgroup, C - cache items per threadgroup +template +kernel void kernel_flash_attn_ext( device const char * q, device const char * k, device const char * v, @@ -2370,15 +2795,15 @@ kernel void kernel_flash_attn_ext_f16( ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups - const short iq3 = tgpig[2]; - const short iq2 = tgpig[1]; - const short iq1 = tgpig[0]*Q; + const int iq3 = tgpig[2]; + const int iq2 = tgpig[1]; + const int iq1 = tgpig[0]*Q; - const short D4 = D/4; - const short D8 = D/8; - //const short Q8 = Q/8; - const short NW = N_SIMDWIDTH; - const short SH = (C + Q); // shared memory per simdgroup in (half) + const short D4 = D/4; + const short D8 = D/8; + const short D16 = D/16; + const short NW = N_SIMDWIDTH; + const short SH = (C + Q); // shared memory per simdgroup in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half) const short TF = T/2; // shared memory size per query in (float) @@ -2388,6 +2813,9 @@ kernel void kernel_flash_attn_ext_f16( threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix + threadgroup half * skv = (threadgroup half *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory + threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4 + // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) simdgroup_half8x8 lo[D8]; @@ -2419,25 +2847,28 @@ kernel void kernel_flash_attn_ext_f16( threadgroup_barrier(mem_flags::mem_threadgroup); { - float S[Q] = { [0 ... Q-1] = 0.0h }; + float S[Q] = { [0 ... Q-1] = 0.0f }; float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 }; + // thread indices inside the simdgroup + const short tx = tiisg%4; + const short ty = tiisg/4; + // assume K and V are same shape const short ne22 = ne12; const short ne23 = ne13; - // broadcast + // broadcast k const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; - const short rv2 = ne02/ne22; - const short rv3 = ne03/ne23; - - // k indices const short ik2 = iq2/rk2; const short ik3 = iq3/rk3; - // v indices + // broadcast v + const short rv2 = ne02/ne22; + const short rv3 = ne03/ne23; + const short iv2 = iq2/rv2; const short iv3 = iq3/rv3; @@ -2476,13 +2907,59 @@ kernel void kernel_flash_attn_ext_f16( for (short cc = 0; cc < C/8; ++cc) { simdgroup_float8x8 mqk = make_filled_simdgroup_matrix(0.h); - device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); + // this is compile-time check, so it does not have runtime overhead + if (is_same::value) { + // we can read directly from global memory + device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); + + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mk; + simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose - for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mk; - simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); + } + } else { + for (short ii = 0; ii < D16; ii += 4) { + device const block_q * pk4 = (device const block_q *) ((device const char *) k + ((ic + 8*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); + + if (D16%4 == 0) { + // the head is evenly divisible by 4*16 = 64, so no need for bound checks + half4x4 tmp; + dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; - simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); + simdgroup_barrier(mem_flags::mem_threadgroup); + +#pragma unroll + for (short k = 0; k < 4; ++k) { + simdgroup_half8x8 mk; + + simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); + + simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); + } + } else { + if (ii + tx < D16) { + half4x4 tmp; + dequantize_func(pk4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < D16; ++k) { + simdgroup_half8x8 mk; + + simdgroup_load(mk, skv + 16*k + 0*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk); + + simdgroup_load(mk, skv + 16*k + 1*8, 4*16, 0, true); // transpose + simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk); + } + } + } } simdgroup_store(mqk, ss + 8*cc, TF, 0, false); @@ -2547,16 +3024,61 @@ kernel void kernel_flash_attn_ext_f16( // O = O + (Q*K^T)*V { for (short cc = 0; cc < C/8; ++cc) { - device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); + simdgroup_float8x8 ms; + simdgroup_load(ms, ss + 8*cc, TF, 0, false); + + if (is_same::value) { + // we can read directly from global memory + device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); +#pragma unroll + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mv; + simdgroup_load(mv, pv + i*8, nb21/sizeof(half), 0, false); + + simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]); + } + } else { + for (short ii = 0; ii < D16; ii += 4) { + device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 8*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); - for (short i = 0; i < D8; ++i) { - simdgroup_half8x8 mk; - simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false); + if (D16%4 == 0) { + // no need for bound checks + half4x4 tmp; + dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; - simdgroup_float8x8 mv; - simdgroup_load(mv, ss + 8*cc, TF, 0, false); + simdgroup_barrier(mem_flags::mem_threadgroup); - simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]); +#pragma unroll + for (short k = 0; k < 4; ++k) { + simdgroup_half8x8 mv; + + simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + + simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + } + } else { + if (ii + tx < D16) { + half4x4 tmp; + dequantize_func(pv4 + (ii + tx)/nl, (ii + tx)%nl, tmp); + skv4[4*ty + tx] = tmp; + } + + simdgroup_barrier(mem_flags::mem_threadgroup); + + for (short k = 0; k < 4 && ii + k < D16; ++k) { + simdgroup_half8x8 mv; + + simdgroup_load(mv, skv + 16*k + 0*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + + simdgroup_load(mv, skv + 16*k + 1*8, 4*16, 0, false); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + } + } + } } } } @@ -2573,7 +3095,7 @@ kernel void kernel_flash_attn_ext_f16( // reduce the warps sequentially for (short sg = 1; sg < nsg; ++sg) { - float S = { 0.0h }; + float S = { 0.0f }; float M = { -FLT_MAX/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -2652,15 +3174,54 @@ kernel void kernel_flash_attn_ext_f16( } } -template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>; -template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>; -template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>; -template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>; -template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>; -//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>; - -template // head size, queries per threadgroup, cache items per threadgroup -kernel void kernel_flash_attn_ext_vec_f16( +typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; + +template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; + +// NOTE: can use half instead of float precision for some extra perf +// D - head size, Q - queries per threadgroup, C - cache items per threadgroup +template +kernel void kernel_flash_attn_ext_vec( device const char * q, device const char * k, device const char * v, @@ -2698,36 +3259,27 @@ kernel void kernel_flash_attn_ext_vec_f16( ushort sgitg[[simdgroup_index_in_threadgroup]]) { const short nsg = ntg.y; // number of simdgroups - const short iq3 = tgpig[2]; - const short iq2 = tgpig[1]; - const short iq1 = tgpig[0]; + const int iq3 = tgpig[2]; + const int iq2 = tgpig[1]; + const int iq1 = tgpig[0]; - const short D4 = D/4; - const short NW = N_SIMDWIDTH; - const short SH = (C + Q); // shared memory per simdgroup in (half) + const short D4 = D/4; + const short D16 = D/16; + const short NW = N_SIMDWIDTH; + const short NW4 = NW/4; + const short SH = C; // shared memory per simdgroup in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half) - float slope = 1.0f; - - // ALiBi - if (max_bias > 0.0f) { - const uint32_t h = iq2; - - const float base = h < n_head_log2 ? m0 : m1; - const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - - slope = pow(base, exp); - } - - //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data - threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 - threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix - threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 - threadgroup half4 * sr4 = (threadgroup half4 *) (shared + sgitg*D + 1*T); // scratch buffer for the results + //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data + threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 + threadgroup half4x4 * sq44 = (threadgroup half4x4 *) (shared + 0*D); // same as above but in half4x4 + threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention + threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 + threadgroup float4x4 * sr44 = (threadgroup float4x4 *) (shared + 2*sgitg*D + Q*T); // scratch buffer for the results // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) - half4 lo[D4/NW]; + float4x4 lo[D16/NW4]; // load heads from Q to shared memory device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03)); @@ -2741,8 +3293,8 @@ kernel void kernel_flash_attn_ext_vec_f16( } // zero out lo - for (short i = tiisg; i < D4; i += NW) { - lo[i/NW] = 0.0h; + for (short i = 0; i < D16/NW4; i += NW4) { + lo[i] = float4x4(0.0f); } // zero out shared memory SH @@ -2753,38 +3305,52 @@ kernel void kernel_flash_attn_ext_vec_f16( threadgroup_barrier(mem_flags::mem_threadgroup); { - float S = { 0.0h }; - float M = { -FLT_MAX/2 }; + float S = 0.0f; + float M = -FLT_MAX/2; + + // thread indices inside the simdgroup + const short tx = tiisg%8; + const short ty = tiisg/8; // assume K and V are same shape const short ne22 = ne12; const short ne23 = ne13; - // broadcast + // broadcast k const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; + const short ik2 = iq2/rk2; + const short ik3 = iq3/rk3; + + // broadcast v const short rv2 = ne02/ne22; const short rv3 = ne03/ne23; - // k indices - const short ik2 = iq2 / rk2; - const short ik3 = iq3 / rk3; - - // v indices - const short iv2 = iq2 / rv2; - const short iv3 = iq3 / rv3; + const short iv2 = iq2/rv2; + const short iv3 = iq3/rv3; // load the queries from shared memory into local memory - float4 mq[D4/NW]; + float4x4 mq[D16/NW4]; - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - mq[ii/NW] = (float4) sq4[i]; + for (short ii = 0; ii < D16; ii += NW4) { + mq[ii/NW4] = (float4x4) sq44[ii + tx]; } // pointer to the mask - device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31); + device const half * mp = (device const half *) (mask + iq1*nb31); + + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = iq2; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } // loop over the KV cache // each simdgroup handles blocks of Q rows and C columns @@ -2796,47 +3362,54 @@ kernel void kernel_flash_attn_ext_vec_f16( // Q*K^T { -#pragma unroll + // each simdgroup processes 1 query and 4 keys for (short cc = 0; cc < C/4; ++cc) { - float4 mqk = { 0.0h }; + float mqk = 0.0; - device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13)); + device const block_q * pk = (device const block_q *) ((device const char *) k + ((ic + 4*cc + ty)*nb11 + ik2*nb12 + ik3*nb13)); #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - const short i = ii + tiisg; + for (short ii = 0; ii < D16; ii += NW4) { + const short i = ii + tx; float4x4 mk; - mk[0] = (float4) pk4[i + 0*(nb11/8)]; - mk[1] = (float4) pk4[i + 1*(nb11/8)]; - mk[2] = (float4) pk4[i + 2*(nb11/8)]; - mk[3] = (float4) pk4[i + 3*(nb11/8)]; + dequantize_func(pk + i/nl, i%nl, mk); - mqk += (float4) (mq[ii/NW] * mk); + mqk += + dot(mq[ii/NW4][0], mk[0]) + + dot(mq[ii/NW4][1], mk[1]) + + dot(mq[ii/NW4][2], mk[2]) + + dot(mq[ii/NW4][3], mk[3]); } - // reduce the results from the threads in the simdgroup - mqk += simd_shuffle_down(mqk, 16); - mqk += simd_shuffle_down(mqk, 8); + // simdgroup reduce + // [ 0 .. 7] -> [ 0] + // [ 8 .. 15] -> [ 8] + // [16 .. 23] -> [16] + // [24 .. 31] -> [24] + //mqk += simd_shuffle_down(mqk, 16); + //mqk += simd_shuffle_down(mqk, 8); mqk += simd_shuffle_down(mqk, 4); mqk += simd_shuffle_down(mqk, 2); mqk += simd_shuffle_down(mqk, 1); // mqk = mqk*scale + mask*slope - if (tiisg == 0) { + if (tx == 0) { mqk *= scale; if (logit_softcap != 0.0f) { mqk = logit_softcap*precise::tanh(mqk); } - mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f; + mqk += (mask != q) ? ((float) mp[ic + 4*cc + ty])*slope : (float) 0.0f; - ss4[cc] = mqk; + ss[4*cc + ty] = mqk; } } } + simdgroup_barrier(mem_flags::mem_threadgroup); + // online softmax { const short p = tiisg; @@ -2856,29 +3429,32 @@ kernel void kernel_flash_attn_ext_vec_f16( // O = diag(ms)*O #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - lo[ii/NW] *= ms; + for (short ii = 0; ii < D16; ii += NW4) { + lo[ii/NW4] *= ms; } } + simdgroup_barrier(mem_flags::mem_threadgroup); + // O = O + (Q*K^T)*V { #pragma unroll for (short cc = 0; cc < C/4; ++cc) { - device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23)); + device const block_q * pv4 = (device const block_q *) ((device const char *) v + ((ic + 4*cc + ty)*nb21 + iv2*nb22 + iv3*nb23)); + + const float4x4 lss(ss[4*cc + ty]); #pragma unroll - for (short ii = 0; ii < D4; ii += NW) { - const short i = ii + tiisg; + for (short ii = 0; ii < D16; ii += NW4) { + const short i = ii + tx; + + float4x4 mv; + dequantize_func(pv4 + i/nl, i%nl, mv); - lo[ii/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; - lo[ii/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; - lo[ii/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; - lo[ii/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; + lo[ii/NW4] += mv*lss; } } } - } // these are needed for reducing the results from the simdgroups (reuse the ss buffer) @@ -2888,10 +3464,32 @@ kernel void kernel_flash_attn_ext_vec_f16( } } + // simdgroup reduce + // [ 0, 8, 16, 24] -> [ 0] + // [ 1, 9, 17, 25] -> [ 1] + // [ 2, 10, 18, 26] -> [ 2] + // [ 3, 11, 19, 27] -> [ 3] + // [ 4, 12, 20, 28] -> [ 4] + // [ 5, 13, 21, 29] -> [ 5] + // [ 6, 14, 22, 30] -> [ 6] + // [ 7, 15, 23, 31] -> [ 7] + for (short ii = 0; ii < D16; ii += NW4) { + lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 16); + lo[ii/NW4][0] += simd_shuffle_down(lo[ii/NW4][0], 8); + + lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 16); + lo[ii/NW4][1] += simd_shuffle_down(lo[ii/NW4][1], 8); + + lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 16); + lo[ii/NW4][2] += simd_shuffle_down(lo[ii/NW4][2], 8); + + lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 16); + lo[ii/NW4][3] += simd_shuffle_down(lo[ii/NW4][3], 8); + } + // store results to shared memory - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - sr4[i] = lo[ii/NW]; + for (short i = tiisg; i < D16; i += NW4) { + sr44[i] = lo[i/NW4]; } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -2918,30 +3516,41 @@ kernel void kernel_flash_attn_ext_vec_f16( } // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1; + for (short i = tiisg; i < D16; i += NW) { + sr44[i] = sr44[i]*ms0 + sr44[i + r*D16]*ms1; } } threadgroup_barrier(mem_flags::mem_threadgroup); } - device float4 * dst4 = (device float4 *) dst; + device float4x4 * dst44 = (device float4x4 *) dst; // final rescale with 1/S and store to global memory if (sgitg == 0) { const float S = ss[0]; - for (short ii = 0; ii < D4; ii += NW) { - short i = ii + tiisg; - dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S; + for (short i = tiisg; i < D16; i += NW) { + dst44[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D16 + i] = sr44[i]/S; } } } -template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>; -//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>; +typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; + +template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; + +template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; +template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; template kernel void kernel_cpy( @@ -2987,10 +3596,17 @@ kernel void kernel_cpy( typedef decltype(kernel_cpy) kernel_cpy_t; -template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy; -template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy; +#endif +template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy; +template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy; +#endif kernel void kernel_cpy_f32_q8_0( device const float * src0, @@ -3339,10 +3955,6 @@ static inline int best_index_int8(int n, constant float * val, float x) { return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } -constexpr constant static float kvalues_iq4nl_f[16] = { - -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f -}; - kernel void kernel_cpy_f32_iq4_nl( device const float * src0, device void * dst, @@ -5457,440 +6069,6 @@ kernel void kernel_mul_mv_iq4_xs_f32( kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } -//============================= templates and their specializations ============================= - -// NOTE: this is not dequantizing - we are simply fitting the template -template -void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - float4x4 temp = *(((device float4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} - -template -void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - half4x4 temp = *(((device half4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} - -template -void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; - } -} - -template -void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; - - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; - } -} - -template -void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 3); - const float d = xb->d; - const float md = -16.h * xb->d; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[i/2][2*(i%2)+0] = d * x0 + md; - reg[i/2][2*(i%2)+1] = d * x1 + md; - } -} - -template -void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 4); - const float d = xb->d; - const float m = xb->m; - const ushort mask = il ? 0x00F0 : 0x000F; - - const uint32_t qh = *((device const uint32_t *)xb->qh); - - const int x_mv = il ? 4 : 0; - - const int gh_mv = il ? 12 : 0; - const int gh_bk = il ? 0 : 4; - - for (int i = 0; i < 8; i++) { - // extract the 5-th bits for x0 and x1 - const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; - const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; - - // combine the 4-bits from qs with the 5th bit - const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); - const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); - - reg[i/2][2*(i%2)+0] = d * x0 + m; - reg[i/2][2*(i%2)+1] = d * x1 + m; - } -} - -template -void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { - device const int8_t * qs = ((device const int8_t *)xb->qs); - const half d = xb->d; - - for (int i = 0; i < 16; i++) { - reg[i/4][i%4] = (qs[i + 16*il] * d); - } -} - -template -void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { - const float d = xb->d; - const float min = xb->dmin; - device const uint8_t * q = (device const uint8_t *)xb->qs; - float dl, ml; - uint8_t sc = xb->scales[il]; - - q = q + 32*(il/8) + 16*(il&1); - il = (il/2)%4; - - half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * q = (device const uint8_t *)xb->qs; - device const uint8_t * h = (device const uint8_t *)xb->hmask; - device const int8_t * scales = (device const int8_t *)xb->scales; - - q = q + 32 * (il/8) + 16 * (il&1); - h = h + 16 * (il&1); - uint8_t m = 1 << (il/2); - uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ - ((il/4)>0 ? 12 : 3); - uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; - uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; - int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) - : (scale_2&kmask2) | ((scale_1&kmask1) << 4); - float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); - const float ml = 4.f * dl; - - il = (il/2) & 3; - const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - dl *= coef; - - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); - } -} - -static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { - return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} - : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; -} - -template -void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { - device const uchar * q = xb->qs; - - short is = (il/4) * 2; - q = q + (il/4) * 32 + 16 * (il&1); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.h; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il<2 ? 0x0F : 0xF0; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * (q[i] & mask) - ml; - } -} - -template -void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { - device const uint8_t * q = xb->qs; - device const uint8_t * qh = xb->qh; - - short is = (il/4) * 2; - q = q + 32 * (il/4) + 16 * (il&1); - qh = qh + 16 * (il&1); - uint8_t ul = 1 << (il/2); - il = il & 3; - const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.f; - const float min = xb->dmin; - const float dl = d * sc[0]; - const float ml = min * sc[1]; - - const ushort mask = il<2 ? 0x0F : 0xF0; - const float qh_val = il<2 ? 16.f : 256.f; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; - } -} - -template -void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { - const half d_all = xb->d; - device const uint8_t * ql = (device const uint8_t *)xb->ql; - device const uint8_t * qh = (device const uint8_t *)xb->qh; - device const int8_t * scales = (device const int8_t *)xb->scales; - - ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); - qh = qh + 32*(il/8) + 16*(il&1); - float sc = scales[(il%2) + 2 * ((il/2))]; - il = (il/2) & 3; - - const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; - const float coef = il>1 ? 1.f/16.f : 1.f; - const float ml = d_all * sc * 32.f; - const float dl = d_all * sc * coef; - for (int i = 0; i < 16; ++i) { - const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) - : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); - reg[i/4][i%4] = dl * q - ml; - } -} - -template -void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. - device const uint16_t * q2 = xb->qs + 4*ib32; - const uint32_t aux32_g = q2[0] | (q2[1] << 16); - const uint32_t aux32_s = q2[2] | (q2[3] << 16); - thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; - const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint16_t * q2 = xb->qs + 4*ib32; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); - uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } - grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); - signs = ksigns_iq2xs[q2[2*il+1] >> 9]; - for (int i = 0; i < 8; ++i) { - reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * q3 = xb->qs + 8*ib32; - device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; - constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); - constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); - uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } - grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); - grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); - signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); - reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); - } -} - -template -void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 8*ib32; - device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); - constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); - constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); - reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); - } - grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); - grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); - for (int i = 0; i < 4; ++i) { - reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); - reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); - } -} - -template -void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const float d = xb->d; - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * signs = qs + QK_K/8; - const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; - constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300))); - constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300))); - for (int i = 0; i < 8; ++i) { - reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]); - reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]); - } -} - -template -void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - const float d = xb->d; - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint16_t * qh = xb->qh; - const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1); - const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA); - const uint16_t h = qh[ib32] >> 6*il; - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml; - reg[1][i] = dl * (grid1[i] >> 4) + ml; - reg[2][i] = dl * (grid2[i] & 0xf) + ml; - reg[3][i] = dl * (grid2[i] >> 4) + ml; - } -} - -template -void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - device const uint16_t * sc = (device const uint16_t *)xb->scales; - - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const float d = scale.f16; - - device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; - device const uint8_t * qh = xb->qh + 2*ib32 + il; - - const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); - const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); - constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); - constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); - for (int i = 0; i < 4; ++i) { - reg[0][i] = dl * (grid1[i] & 0xf) + ml1; - reg[1][i] = dl * (grid1[i] >> 4) + ml1; - reg[2][i] = dl * (grid2[i] & 0xf) + ml2; - reg[3][i] = dl * (grid2[i] >> 4) + ml2; - } -} - -template -void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { - device const uint16_t * q4 = (device const uint16_t *)xb->qs; - const float d = xb->d; - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - -template -void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { - // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 - const int ib32 = il/2; - il = il%2; - // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 - device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32; - const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)xb->d * (ls - 32); - uint32_t aux32; - thread const uint8_t * q8 = (thread const uint8_t *)&aux32; - for (int i = 0; i < 4; ++i) { - aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f; - reg[i][0] = d * kvalues_iq4nl_f[q8[0]]; - reg[i][1] = d * kvalues_iq4nl_f[q8[1]]; - reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; - reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; - } -} - template kernel void kernel_get_rows_q( device const void * src0, @@ -6333,6 +6511,9 @@ typedef decltype(kernel_get_rows_f) get_rows_f_t; template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f; +#endif typedef decltype(kernel_get_rows_q) get_rows_q_t; @@ -6364,6 +6545,9 @@ typedef decltype(kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +#endif template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -6392,6 +6576,9 @@ typedef decltype(kernel_mul_mm_id) mat_mm_id_t; template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +#endif template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -6615,6 +6802,9 @@ typedef decltype(kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#if !defined(GGML_METAL_NO_BFLOAT) +template [[host_name("kernel_mul_mv_id_bf16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +#endif template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7aa6dce8907f5..82a463f27ccc4 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4,7 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" - +#include "ggml-cpu.h" #include #include @@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined __AVX__ - const __m128i m4 = _mm_set1_epi8(0xF); const __m128i m3 = _mm_set1_epi8(3); - const __m128i m32s = _mm_set1_epi8(32); - const __m128i m2 = _mm_set1_epi8(2); + const __m128i m15 = _mm_set1_epi8(15); __m256 acc = _mm256_setzero_ps(); @@ -9119,12 +9117,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict qh = x[i].qh; const int8_t * restrict q8 = y[i].qs; + // handle the q6_k -32 offset separately using bsums + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); + const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); + const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); + const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); - __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + int is = 0; + for (int j = 0; j < QK_K/128; ++j) { const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; @@ -9132,26 +9138,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); - const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); - const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); - const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); - const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); + const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); + const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); + const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); + const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); - const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); - const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); - const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); - const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; @@ -9162,15 +9168,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); - __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); - __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); - __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); - __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4); - __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); - __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); - __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); - __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); @@ -9180,32 +9177,20 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - p16_4 = _mm_sub_epi16(p16_4, q8s_4); - p16_5 = _mm_sub_epi16(p16_5, q8s_5); - p16_6 = _mm_sub_epi16(p16_6, q8s_6); - p16_7 = _mm_sub_epi16(p16_7, q8s_7); - - const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); - const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); - p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); - p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); @@ -9214,8 +9199,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r } - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); + sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); + const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7dc3340a1e749..266a0d6f044db 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -220,8 +220,10 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi void * ggml_aligned_malloc(size_t size) { + const int alignment = 64; + #if defined(_MSC_VER) || defined(__MINGW32__) - return _aligned_malloc(size, TENSOR_ALIGNMENT); + return _aligned_malloc(size, alignment); #else if (size == 0) { GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); @@ -229,8 +231,9 @@ void * ggml_aligned_malloc(size_t size) { } void * aligned_memory = NULL; #ifdef GGML_USE_CPU_HBM - int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); + int result = hbw_posix_memalign(&aligned_memory, alignment, size); #elif TARGET_OS_OSX + GGML_UNUSED(alignment); kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); int result = EFAULT; switch (alloc_status) { @@ -248,7 +251,7 @@ void * ggml_aligned_malloc(size_t size) { break; } #else - int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); + int result = posix_memalign(&aligned_memory, alignment, size); #endif if (result != 0) { // Handle allocation failure @@ -392,6 +395,8 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { 16))); } } +#endif +#if defined(__AVX2__) if (ggml_cpu_has_avx2()) { for (; i + 8 <= n; i += 8) { _mm256_storeu_ps(y + i, @@ -1402,11 +1407,11 @@ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const str //////////////////////////////////////////////////////////////////////////////// struct ggml_context * ggml_init(struct ggml_init_params params) { - static bool is_first_call = false; + static bool is_first_call = true; ggml_critical_section_start(); - if (!is_first_call) { + if (is_first_call) { // initialize time system (required on Windows) ggml_time_init(); @@ -1417,7 +1422,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } u = {i}; ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); } - is_first_call = true; + + is_first_call = false; } ggml_critical_section_end(); diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 48863847c16f6..020c60f341c23 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -bb78a40dc60e04c626bac2b65840b509988e990d +a099cb514d6687e436a5a423d1fb0448be0feb20 diff --git a/src/llama.cpp b/src/llama.cpp index 3e563d811b77c..6719edb38808f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9134,7 +9134,7 @@ static bool llm_load_tensors( // print memory requirements per buffer type for (auto & buf : model.bufs) { - LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0); } // populate tensors_by_name @@ -21799,8 +21799,11 @@ static int32_t llama_chat_apply_template_internal( // IBM Granite template for (const auto & message : chat) { std::string role(message->role); - ss << "<|start_of_role|>" << role << "<|end_of_role|>" - << message->content << "<|end_of_text|>\n"; + ss << "<|start_of_role|>" << role << "<|end_of_role|>"; + if (role == "assistant_tool_call") { + ss << "<|tool_call|>"; + } + ss << message->content << "<|end_of_text|>\n"; } if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>\n"; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 46346cbd0bde6..6cc77edab2b62 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3599,7 +3599,7 @@ static std::vector> make_test_cases_eval() { for (int n_mats : {4}) { for (int n_used : {2}) { for (bool b : {false}) { - for (int n : {1}) { + for (int n : {1, 32}) { int m = 512; int k = 256; test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));