Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/quantize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,11 @@
add_executable(quantize quantize.cpp)
target_link_libraries(quantize PRIVATE ggml tts)

# The snapshot test is a bash script, so it is only staged on non-Windows hosts.
if (NOT WIN32)
    # Always-built target that pulls in the copy rule below, so the script is
    # refreshed next to the quantize binary as part of the default build.
    add_custom_target(quantize_snapshot_test.sh ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh)
    # Copy the script into the runtime output directory whenever the source
    # copy changes. VERBATIM guarantees portable argument escaping.
    add_custom_command(OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh
                       COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/quantize_snapshot_test.sh
                               ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/quantize_snapshot_test.sh
                       DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/quantize_snapshot_test.sh
                       VERBATIM)
    # Build-order edge only (not a link): the script invokes the quantize
    # binary, so make sure it is built whenever the script target is.
    add_dependencies(quantize_snapshot_test.sh quantize)
endif()
34 changes: 20 additions & 14 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
#include "tts.h"
#include "args.h"
#include <stdio.h>
#include <thread>
#include "ggml.h"
#include <map>
#include <vector>

std::vector<ggml_type> valid_quantization_types = {
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q5_0,
GGML_TYPE_Q8_0,
#include "args.h"
#include "ggml.h"
#include "tts.h"

// Maps the user-facing --quantized-type (-qt) string to its ggml_type enum.
// Each canonical name (e.g. "Q4_0") also has a short alias (e.g. "Q4") that
// maps to the same enum value; lookup is exact (case-sensitive).
const std::map<std::string, ggml_type> valid_quantization_types = {
{"FP16", GGML_TYPE_F16},
{"F16", GGML_TYPE_F16},
{"Q4_0", GGML_TYPE_Q4_0},
{"Q4", GGML_TYPE_Q4_0},
{"Q5_0", GGML_TYPE_Q5_0},
{"Q5", GGML_TYPE_Q5_0},
{"Q8_0", GGML_TYPE_Q8_0},
{"Q8", GGML_TYPE_Q8_0},
};

int main(int argc, const char ** argv) {
int default_quantization = (int) GGML_TYPE_Q4_0;
int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
arg_list args;
args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini v1 to quantize.", "-mp", true));
args.add_argument(string_arg("--quantized-model-path", "(REQUIRED) The path to save the model in a quantized format.", "-qp", true));
args.add_argument(int_arg("--quantized-type", "(OPTIONAL) The ggml enum of the quantized type to convert compatible model tensors to. For more information see readme. Defaults to Q4_0 quantization (2).", "-qt", false, &default_quantization));
args.add_argument(string_arg("--quantized-type", "(OPTIONAL) The ggml enum of the quantized type to convert compatible model tensors to. For more information see readme. Defaults to Q4_0 quantization (2).", "-qt", false, "Q4_0"));
args.add_argument(int_arg("--n-threads", "(OPTIONAL) The number of cpu threads to run the quantization process with. Defaults to known hardware concurrency.", "-nt", false, &default_n_threads));
args.add_argument(bool_arg("--convert-dac-to-f16", "(OPTIONAL) Whether to convert the DAC audio decoder model to a 16 bit float.", "-df"));
args.add_argument(bool_arg("--quantize-output-heads", "(OPTIONAL) Whether to quantize the output heads. Defaults to false and is true when passed (does not accept a parameter).", "-qh"));
Expand All @@ -31,12 +36,13 @@ int main(int argc, const char ** argv) {
return 0;
}
args.validate();
enum ggml_type qtype = static_cast<ggml_type>(*args.get_int_param("--quantized-type"));
if (std::find(valid_quantization_types.begin(), valid_quantization_types.end(), qtype) == valid_quantization_types.end()) {
fprintf(stderr, "ERROR: %d is not a valid quantization type.\n", qtype);
std::string qtype = args.get_string_param("--quantized-type");
if (!valid_quantization_types.contains(qtype)) {
fprintf(stderr, "ERROR: %s is not a valid quantization type.\n",
qtype.c_str());
exit(1);
}
struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), qtype);
struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), valid_quantization_types.at(qtype));
qp->quantize_output_heads = args.get_bool_param("--quantize-output-heads");
qp->quantize_text_embeddings = args.get_bool_param("--quantize-text-embedding");
qp->quantize_cross_attn_kv = args.get_bool_param("--quantize-cross-attn-kv");
Expand Down
103 changes: 103 additions & 0 deletions examples/quantize/quantize_snapshot_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/bin/bash
set -e
# Performs snapshot testing against mmwillet2's version on Hugging Face

# To avoid redownloading models, cd to a persistent directory instead of build/bin
[ $# -eq 0 ] || { echo 'Usage: ./quantize_snapshot_test.sh'; exit 1; }
# The quantize binary is expected to sit in the same directory as this script.
quantize="$(dirname "$0")"/quantize
[ -x "$quantize" ] || { echo 'quantize binary in script directory is not executable'; exit 1; }

# gguf_py_venv is a sentinel symlink: it points at llama.cpp's gguf-py venv
# when the user supplies a path, or at /dev/null to record "no dumper" so the
# question is not asked again on later runs.
if [ ! -e gguf_py_venv ]; then
read -rp 'Path to llama.cpp (or blank): ' llamacpppath
if [ -z "$llamacpppath" ]; then
ln -s /dev/null gguf_py_venv
else
[ -f "$llamacpppath"/gguf-py/pyproject.toml ] || { echo 'Incompatible llama.cpp or path'; exit 1; }
pushd "$llamacpppath"/gguf-py
# Create the venv once and install gguf-py into it (editable install).
if [ ! -d venv ]; then
python3 -m venv venv
(
. venv/bin/activate
pip install -e .
)
fi
popd
ln -s "$llamacpppath"/gguf-py/venv gguf_py_venv
fi
fi
# Only a real venv directory enables gguf-dump; the /dev/null symlink leaves
# $dumper empty, which disables the dump-diff fallback inside q below.
if [ -d gguf_py_venv ]; then
. gguf_py_venv/bin/activate
dumper=gguf-dump
command -v "$dumper" >/dev/null 2>&1 || { echo 'Missing gguf-dump'; exit 1; }
fi

# Quantized outputs are written to $XDG_RUNTIME_DIR (typically a tmpfs).
if [ -z "$XDG_RUNTIME_DIR" ]; then
XDG_RUNTIME_DIR=/tmp # CI or macOS
else
# Parse the tmpfs size (in KiB) for $XDG_RUNTIME_DIR out of /proc/mounts;
# if it is under 4 GiB, remount it larger so big models fit.
size="$(sed -n "/^tmpfs ${XDG_RUNTIME_DIR//\//\\\/}/s/.\\+size=\\([0-9]\\+\\)k.\\+/\\1/p" /proc/mounts)"
if [ -n "$size" ] && [ "$size" -lt 4194304 ]; then
(
set -x
sudo mount -o remount,size=4G "$XDG_RUNTIME_DIR"
)
fi
fi

# Reference repos. Only the committed LFS pointer files are read (via
# git cat-file below), so the comparison works without fetching the blobs.
[ -d Dia_GGUF ] || git clone https://huggingface.co/mmwillet2/Dia_GGUF
[ -d Kokoro_GGUF ] || git clone https://huggingface.co/mmwillet2/Kokoro_GGUF
[ -d parler-tts-mini-v1-GGUF ] || git clone https://huggingface.co/ecyht2/parler-tts-mini-v1-GGUF

declare -a extra_args
# q TYPE REF: quantize $model with "-qt TYPE" plus $extra_args, then compare
# the output's sha256 against the hash recorded for REF in the model's git
# repo. On mismatch, and when a dumper is available, print a gguf-dump diff.
function q {
model_dir="$(dirname "$model")"
# Capture quantizer output; only replay it if the quantizer failed.
log="$(
set -x
"$quantize" -mp "$model" -qt "$1" -qp "$XDG_RUNTIME_DIR"/test.gguf "${extra_args[@]}" 2>&1
)" || echo -n "$log"

new_hash="$(sha256sum "$XDG_RUNTIME_DIR"/test.gguf)"
# Strip the trailing " filename" that sha256sum appends, keeping the hash.
new_hash="${new_hash% *}"
echo "$new_hash"' '"$XDG_RUNTIME_DIR"/test.gguf
# The committed file is a Git LFS pointer; its second line is
# "oid sha256:<hash>", i.e. the expected hash of REF.
old_hash="$(git -C "$model_dir" cat-file -p HEAD:"$2" | sed -n '2s/^oid sha256://p')"
echo "$old_hash"' '"$model_dir"/"$2"

# Dump the mismatching output before deleting it (non-final commands of an
# && list do not trip set -e when the guard tests are false).
[ "$new_hash" != "$old_hash" ] && [ -n "$dumper" ] && "$dumper" "$XDG_RUNTIME_DIR"/test.gguf > "$XDG_RUNTIME_DIR"/test.gguf.gguf-dump.txt 2>/dev/null
unlink "$XDG_RUNTIME_DIR"/test.gguf
if [ "$new_hash" != "$old_hash" ] && [ -n "$dumper" ]; then
# Cache the reference dump beside the model repo so it is produced once.
[ -f "$model_dir"/"$2".gguf-dump.txt ] || "$dumper" "$model_dir"/"$2" > "$model_dir"/"$2".gguf-dump.txt 2>/dev/null
# "|| :" keeps set -e from aborting when diff reports differences.
diff -U3 "$model_dir"/"$2".gguf-dump.txt "$XDG_RUNTIME_DIR"/test.gguf.gguf-dump.txt || :
fi
}

model=Dia_GGUF/Dia.gguf
extra_args=(-nt 3)
q F16 Dia_F16.gguf
q Q4 Dia_Q4.gguf
q Q5 Dia_Q5.gguf
q Q8 Dia_Q8.gguf
# -df: additionally convert the DAC audio decoder to F16.
extra_args=(-nt 3 -df)
q F16 Dia_F16_DAC_F16.gguf
q Q4 Dia_Q4_DAC_F16.gguf
q Q5 Dia_Q5_DAC_F16.gguf
q Q8 Dia_Q8_DAC_F16.gguf

model=Kokoro_GGUF/Kokoro_espeak.gguf
extra_args=(-nt 3 -nqf)
q F16 Kokoro_espeak_F16.gguf
q Q4 Kokoro_espeak_Q4.gguf
q Q5 Kokoro_espeak_Q5.gguf
q Q8 Kokoro_espeak_Q8.gguf
model=Kokoro_GGUF/Kokoro_no_espeak.gguf
q F16 Kokoro_no_espeak_F16.gguf
q Q4 Kokoro_no_espeak_Q4.gguf
q Q5 Kokoro_no_espeak_Q5.gguf
q Q8 Kokoro_no_espeak_Q8.gguf

model=parler-tts-mini-v1-GGUF/parler-tts-mini-v1-fp32.gguf
extra_args=(-nt 3)
q FP16 parler-tts-mini-v1-fp16.gguf
q Q4_0 parler-tts-mini-v1-Q4_0.gguf
q Q5_0 parler-tts-mini-v1-Q5_0.gguf
q Q8_0 parler-tts-mini-v1-Q8_0.gguf

# Remove the quantized output and any leftover dump from the final comparison.
rm "$XDG_RUNTIME_DIR"/test.gguf*