diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index fda21c8..a175845 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,2 +1,11 @@
 add_executable(quantize quantize.cpp)
 target_link_libraries(quantize PRIVATE ggml tts)
+
+if (NOT WIN32)
+    add_custom_target(quantize_snapshot_test.sh ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh)
+    add_custom_command(OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh
+        COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/quantize_snapshot_test.sh
+            ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/quantize_snapshot_test.sh)
+    add_dependencies(quantize_snapshot_test.sh quantize)
+endif ()
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index ecf09e8..2cad888 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,24 +1,29 @@
-#include "tts.h"
-#include "args.h"
 #include <thread>
 #include <vector>
-#include "ggml.h"
+#include <map>
 #include <string>
 
-std::vector<ggml_type> valid_quantization_types = {
-    GGML_TYPE_F16,
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q5_0,
-    GGML_TYPE_Q8_0,
+#include "args.h"
+#include "ggml.h"
+#include "tts.h"
+
+const std::map<std::string, ggml_type> valid_quantization_types = {
+    {"FP16", GGML_TYPE_F16},
+    {"F16", GGML_TYPE_F16},
+    {"Q4_0", GGML_TYPE_Q4_0},
+    {"Q4", GGML_TYPE_Q4_0},
+    {"Q5_0", GGML_TYPE_Q5_0},
+    {"Q5", GGML_TYPE_Q5_0},
+    {"Q8_0", GGML_TYPE_Q8_0},
+    {"Q8", GGML_TYPE_Q8_0},
 };
 
 int main(int argc, const char ** argv) {
-    int default_quantization = (int) GGML_TYPE_Q4_0;
     int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
     arg_list args;
     args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini v1 to quantize.", "-mp", true));
     args.add_argument(string_arg("--quantized-model-path", "(REQUIRED) The path to save the model in a quantized format.", "-qp", true));
-    args.add_argument(int_arg("--quantized-type", "(OPTIONAL) The ggml enum of the quantized type to convert compatible model tensors to. For more information see readme. Defaults to Q4_0 quantization (2).", "-qt", false, &default_quantization));
+    args.add_argument(string_arg("--quantized-type", "(OPTIONAL) The quantization type to convert compatible model tensors to; one of FP16, Q4_0, Q5_0, or Q8_0 (see the readme for details). Defaults to Q4_0.", "-qt", false, "Q4_0"));
     args.add_argument(int_arg("--n-threads", "(OPTIONAL) The number of cpu threads to run the quantization process with. Defaults to known hardware concurrency.", "-nt", false, &default_n_threads));
     args.add_argument(bool_arg("--convert-dac-to-f16", "(OPTIONAL) Whether to convert the DAC audio decoder model to a 16 bit float.", "-df"));
     args.add_argument(bool_arg("--quantize-output-heads", "(OPTIONAL) Whether to quantize the output heads. Defaults to false and is true when passed (does not accept a parameter).", "-qh"));
@@ -31,12 +36,13 @@ int main(int argc, const char ** argv) {
         return 0;
     }
     args.validate();
-    enum ggml_type qtype = static_cast<enum ggml_type>(*args.get_int_param("--quantized-type"));
-    if (std::find(valid_quantization_types.begin(), valid_quantization_types.end(), qtype) == valid_quantization_types.end()) {
-        fprintf(stderr, "ERROR: %d is not a valid quantization type.\n", qtype);
+    std::string qtype = args.get_string_param("--quantized-type");
+    if (!valid_quantization_types.contains(qtype)) {
+        fprintf(stderr, "ERROR: %s is not a valid quantization type.\n",
+                qtype.c_str());
         exit(1);
     }
-    struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), qtype);
+    struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), valid_quantization_types.at(qtype));
     qp->quantize_output_heads = args.get_bool_param("--quantize-output-heads");
     qp->quantize_text_embeddings = args.get_bool_param("--quantize-text-embedding");
     qp->quantize_cross_attn_kv = args.get_bool_param("--quantize-cross-attn-kv");
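Note: -qt now takes a type name rather than a raw ggml enum value. A minimal invocation sketch, assuming a built quantize binary and an already-downloaded model (both paths are illustrative):

    ./quantize -mp Dia_GGUF/Dia.gguf -qp /tmp/Dia_Q4_0.gguf -qt Q4_0 -nt 4

An unrecognized name fails fast: per the code above, -qt Q6_K would print "ERROR: Q6_K is not a valid quantization type." and exit with status 1.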
diff --git a/examples/quantize/quantize_snapshot_test.sh b/examples/quantize/quantize_snapshot_test.sh
new file mode 100755
index 0000000..68b5f28
--- /dev/null
+++ b/examples/quantize/quantize_snapshot_test.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -e
+# Performs snapshot testing against mmwillet2's version on Hugging Face
+
+# To avoid redownloading models, cd to a persistent directory instead of build/bin
+[ $# -eq 0 ] || { echo 'Usage: ./quantize_snapshot_test.sh'; exit 1; }
+quantize="$(dirname "$0")"/quantize
+[ -x "$quantize" ] || { echo 'quantize binary in script directory is not executable'; exit 1; }
+
+if [ ! -e gguf_py_venv ]; then
+    read -rp 'Path to llama.cpp (or blank): ' llamacpppath
+    if [ -z "$llamacpppath" ]; then
+        ln -s /dev/null gguf_py_venv
+    else
+        [ -f "$llamacpppath"/gguf-py/pyproject.toml ] || { echo 'Incompatible llama.cpp or path'; exit 1; }
+        pushd "$llamacpppath"/gguf-py
+        if [ ! -d venv ]; then
+            python3 -m venv venv
+            (
+                . venv/bin/activate
+                pip install -e .
+            )
+        fi
+        popd
+        ln -s "$llamacpppath"/gguf-py/venv gguf_py_venv
+    fi
+fi
+if [ -d gguf_py_venv ]; then
+    . gguf_py_venv/bin/activate
+    dumper=gguf-dump
+    command -v "$dumper" >/dev/null 2>&1 || { echo 'Missing gguf-dump'; exit 1; }
+fi
+
+if [ -z "$XDG_RUNTIME_DIR" ]; then
+    XDG_RUNTIME_DIR=/tmp # CI or macOS
+else
+    size="$(sed -n "/^tmpfs ${XDG_RUNTIME_DIR//\//\\\/}/s/.\\+size=\\([0-9]\\+\\)k.\\+/\\1/p" /proc/mounts)"
+    if [ -n "$size" ] && [ "$size" -lt 4194304 ]; then
+        (
+            set -x
+            sudo mount -o remount,size=4G "$XDG_RUNTIME_DIR"
+        )
+    fi
+fi
+
+[ -d Dia_GGUF ] || git clone https://huggingface.co/mmwillet2/Dia_GGUF
+[ -d Kokoro_GGUF ] || git clone https://huggingface.co/mmwillet2/Kokoro_GGUF
+[ -d parler-tts-mini-v1-GGUF ] || git clone https://huggingface.co/ecyht2/parler-tts-mini-v1-GGUF
+
+declare -a extra_args
+function q {
+    model_dir="$(dirname "$model")"
+    log="$(
+        set -x
+        "$quantize" -mp "$model" -qt "$1" -qp "$XDG_RUNTIME_DIR"/test.gguf "${extra_args[@]}" 2>&1
+    )" || echo -n "$log"
+
+    new_hash="$(sha256sum "$XDG_RUNTIME_DIR"/test.gguf)"
+    new_hash="${new_hash% *}"
+    echo "$new_hash"' '"$XDG_RUNTIME_DIR"/test.gguf
+    old_hash="$(git -C "$model_dir" cat-file -p HEAD:"$2" | sed -n '2s/^oid sha256://p')"
+    echo "$old_hash"' '"$model_dir"/"$2"
+
+    [ "$new_hash" != "$old_hash" ] && [ -n "$dumper" ] && "$dumper" "$XDG_RUNTIME_DIR"/test.gguf > "$XDG_RUNTIME_DIR"/test.gguf.gguf-dump.txt 2>/dev/null
+    unlink "$XDG_RUNTIME_DIR"/test.gguf
+    if [ "$new_hash" != "$old_hash" ] && [ -n "$dumper" ]; then
+        [ -f "$model_dir"/"$2".gguf-dump.txt ] || "$dumper" "$model_dir"/"$2" > "$model_dir"/"$2".gguf-dump.txt 2>/dev/null
+        diff -U3 "$model_dir"/"$2".gguf-dump.txt "$XDG_RUNTIME_DIR"/test.gguf.gguf-dump.txt || :
+    fi
+}
+
+model=Dia_GGUF/Dia.gguf
+extra_args=(-nt 3)
+q F16 Dia_F16.gguf
+q Q4 Dia_Q4.gguf
+q Q5 Dia_Q5.gguf
+q Q8 Dia_Q8.gguf
+extra_args=(-nt 3 -df)
+q F16 Dia_F16_DAC_F16.gguf
+q Q4 Dia_Q4_DAC_F16.gguf
+q Q5 Dia_Q5_DAC_F16.gguf
+q Q8 Dia_Q8_DAC_F16.gguf
+
+model=Kokoro_GGUF/Kokoro_espeak.gguf
+extra_args=(-nt 3 -nqf)
+q F16 Kokoro_espeak_F16.gguf
+q Q4 Kokoro_espeak_Q4.gguf
+q Q5 Kokoro_espeak_Q5.gguf
+q Q8 Kokoro_espeak_Q8.gguf
+model=Kokoro_GGUF/Kokoro_no_espeak.gguf
+q F16 Kokoro_no_espeak_F16.gguf
+q Q4 Kokoro_no_espeak_Q4.gguf
+q Q5 Kokoro_no_espeak_Q5.gguf
+q Q8 Kokoro_no_espeak_Q8.gguf
+
+model=parler-tts-mini-v1-GGUF/parler-tts-mini-v1-fp32.gguf
+extra_args=(-nt 3)
+q FP16 parler-tts-mini-v1-fp16.gguf
+q Q4_0 parler-tts-mini-v1-Q4_0.gguf
+q Q5_0 parler-tts-mini-v1-Q5_0.gguf
+q Q8_0 parler-tts-mini-v1-Q8_0.gguf
+
+rm "$XDG_RUNTIME_DIR"/test.gguf*
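A usage sketch for the snapshot test, assuming a persistent working directory so the cloned reference models survive between runs (both paths below are illustrative):

    mkdir -p ~/tts_snapshots && cd ~/tts_snapshots
    ~/TTS.cpp/build/bin/quantize_snapshot_test.sh

The script locates the quantize binary next to itself, clones the reference GGUF repos on first use, and for any SHA-256 mismatch against the hash recorded in the upstream Git LFS pointer prints a unified diff of the two gguf-dump outputs.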