Name and Version
System Specifications
- Operating System: Ubuntu 22.04
- CPU:
  - Model: Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz
  - Cores: 6
  - Threads: 12
  - Architecture: x86_64
  - Caches: L1d: 192KB, L1i: 192KB, L2: 1.5MB, L3: 12MB
  - Governor: performance
- GPU:
  - Model: Advanced Micro Devices, Inc. [AMD/ATI] Navi 23 [Radeon RX 6600/6600 XT/6600M]
  - Driver Name: amdgpu
  - Total VRAM: 8176MiB
  - ROCm Version: 6.3.x
- RAM: 62GB
I'm getting an error on the ADD op:
/home/gym/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:83: ROCm error
ggml_cuda_compute_forward: ADD failed
ROCm error: invalid device function
current device: 0, in function ggml_cuda_compute_forward at /home/gym/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:2504
err
[New LWP 28922]
[New LWP 28936]
[New LWP 28937]
[New LWP 28938]
[New LWP 28939]
[New LWP 28940]
[New LWP 28941]
[New LWP 28942]
[New LWP 28943]
[New LWP 28944]
[New LWP 28945]
[New LWP 28946]
[New LWP 28947]
[New LWP 28948]
[New LWP 28949]
[New LWP 28956]
[New LWP 28957]
[New LWP 28958]
[New LWP 28959]
[New LWP 28960]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
0x000078f52e8ea42f in __GI___wait4 (pid=28961, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
30 ../sysdeps/unix/sysv/linux/wait4.c: No such file or directory.
#0 0x000078f52e8ea42f in __GI___wait4 (pid=28961, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
30 in ../sysdeps/unix/sysv/linux/wait4.c
#1 0x000078f531497366 in ggml_print_backtrace () from /home/gym/llama.cpp/build/bin/libggml-base.so
#2 0x000078f5314975b9 in ggml_abort () from /home/gym/llama.cpp/build/bin/libggml-base.so
#3 0x000078f52f0f6fc2 in ggml_cuda_error(char const*, char const*, char const*, int, char const*) () from /home/gym/llama.cpp/build/bin/libggml-hip.so
#4 0x000078f52f0fdc63 in ggml_backend_cuda_graph_compute(ggml_backend*, ggml_cgraph*) () from /home/gym/llama.cpp/build/bin/libggml-hip.so
#5 0x000078f5314b1fab in ggml_backend_sched_graph_compute_async () from /home/gym/llama.cpp/build/bin/libggml-base.so
#6 0x000078f5312dbe81 in llama_context::graph_compute(ggml_cgraph*, bool) () from /home/gym/llama.cpp/build/bin/libllama.so
#7 0x000078f5312dbb07 in llama_context::process_ubatch(llama_ubatch const&, llm_graph_type, llama_memory_context_i*, ggml_status&) () from /home/gym/llama.cpp/build/bin/libllama.so
#8 0x000078f5312dcefe in llama_context::decode(llama_batch const&) () from /home/gym/llama.cpp/build/bin/libllama.so
#9 0x000078f5312e0efb in llama_decode () from /home/gym/llama.cpp/build/bin/libllama.so
#10 0x0000000000472d9b in common_init_from_params(common_params&) ()
#11 0x00000000002c8ffe in server_context::load_model(common_params const&) ()
#12 0x000000000028ca89 in main ()
Operating systems
Linux
GGML backends
HIP
Hardware
AMD Radeon RX 6600 XT
Models
Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf
Problem description & steps to reproduce
The llama.cpp server aborts with the "invalid device function" error shown above. However, ollama runs fine on my device (and quite efficiently), with:
GGML_ROCM_ENABLE_UNIFIED_MEMORY=1 \
OLLAMA_NUM_PARALLEL=1 \
OLLAMA_KV_CACHE_TYPE=q8_0 \
OLLAMA_FLASH_ATTENTION=1 \
HSA_OVERRIDE_GFX_VERSION=10.3.0 \
ollama serve
Or, equivalently, in llama.cpp:
GGML_ROCM_ENABLE_UNIFIED_MEMORY=1 \
sudo cpupower frequency-set -g performance && \
HSA_OVERRIDE_GFX_VERSION=10.3.0 \
/home/gym/llama.cpp/build/bin/llama-server \
--model /home/gym/llama.cpp/models/qwen7b/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf \
--host 0.0.0.0 \
--port 8000 \
--ctx-size 32000 \
--n-gpu-layers 1 \
-fa \
-ctkd q8_0
Of course, the llama.cpp server works if I run solely on the CPU. It also runs if I install the Vulkan backend, but prompt processing then takes over a minute (with ollama too), unlike ROCm, so I need to get this working with ROCm.
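My guess (an assumption on my part, not a confirmed diagnosis) is that the libggml-hip.so I built contains no code objects for the gfx1030 architecture the runtime reports, which is what "invalid device function" usually means. For reference, this is roughly the HIP build command from the repo's build docs that I would expect to produce gfx1030 kernels (flag names quoted from memory, so please double-check them):
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --config Release -- -j 12
If AMDGPU_TARGETS was left at its default when I configured, that alone could explain the crash.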
To rule out a driver or runtime problem, I wrote a simple C++ app verifying that a plain HIP vector add works:
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
// HIP kernel for vector addition
__global__ void vectorAdd(float* A, float* B, float* C, int numElements) {
    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
int main() {
    int numElements = 10000;
    size_t size = numElements * sizeof(float);
    std::vector<float> h_A(numElements);
    std::vector<float> h_B(numElements);
    std::vector<float> h_C(numElements);
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
        h_B[i] = i * 2;
    }
    float *d_A, *d_B, *d_C;
    hipMalloc(&d_A, size);
    hipMalloc(&d_B, size);
    hipMalloc(&d_C, size);
    hipMemcpy(d_A, h_A.data(), size, hipMemcpyHostToDevice);
    hipMemcpy(d_B, h_B.data(), size, hipMemcpyHostToDevice);
    int blockSize = 256;
    int numBlocks = (numElements + blockSize - 1) / blockSize;
    hipLaunchKernelGGL(vectorAdd, dim3(numBlocks), dim3(blockSize), 0, 0, d_A, d_B, d_C, numElements);
    hipMemcpy(h_C.data(), d_C, size, hipMemcpyDeviceToHost);
    bool success = true;
    for (int i = 0; i < numElements; ++i) {
        if (h_C[i] != (h_A[i] + h_B[i])) {
            success = false;
            break;
        }
    }
    if (success) {
        std::cout << "Vector addition successful!" << std::endl;
    } else {
        std::cout << "Vector addition FAILED!" << std::endl;
    }
    hipFree(d_A);
    hipFree(d_B);
    hipFree(d_C);
    return 0;
}
/opt/rocm-6.3.0/bin/hipcc -o vector_add_test vector_add.cpp
vector_add.cpp:36:5: warning: ignoring return value of function declared with 'nodiscard' attribute [-Wunused-result]
9 warnings generated when compiling for host.
gym@gym-pc:~$ ./vector_add_test
Vector addition successful!
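As one more data point, here is a small check I put together (my own diagnostic, not something llama.cpp provides; the file name is arbitrary). It prints the gcnArchName string the HIP runtime reports for each device, which is what the loader has to match against the code objects embedded in libggml-hip.so:
#include <hip/hip_runtime.h>
#include <iostream>
int main() {
    int count = 0;
    if (hipGetDeviceCount(&count) != hipSuccess || count == 0) {
        std::cerr << "no HIP devices found" << std::endl;
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        hipDeviceProp_t prop{};
        if (hipGetDeviceProperties(&prop, i) != hipSuccess) {
            std::cerr << "failed to query device " << i << std::endl;
            continue;
        }
        // gcnArchName is the string kernel code objects must match;
        // with HSA_OVERRIDE_GFX_VERSION=10.3.0 it should start with gfx1030
        std::cout << "device " << i << ": " << prop.name
                  << " (" << prop.gcnArchName << ")" << std::endl;
    }
    return 0;
}
Compiled the same way as the vector add test (e.g. /opt/rocm-6.3.0/bin/hipcc -o device_query device_query.cpp), this should show whether the runtime really sees gfx1030 with the override set.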
First Bad Commit
No response
Relevant log output
Setting cpu: 0
Setting cpu: 1
Setting cpu: 2
Setting cpu: 3
Setting cpu: 4
Setting cpu: 5
Setting cpu: 6
Setting cpu: 7
Setting cpu: 8
Setting cpu: 9
Setting cpu: 10
Setting cpu: 11
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 ROCm devices:
Device 0: AMD Radeon RX 6600 XT, gfx1030 (0x1030), VMM: no, Wave Size: 32
build: 6106 (5fd160bb) with AMD clang version 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.3.0 24455 f24aa3b4a91f6ee2fcd15629ba0b49fa545d8d6b) for x86_64-unknown-linux-gnu
system info: n_threads = 6, n_threads_batch = 6, total_threads = 12
system_info: n_threads = 6 (n_threads_batch = 6) / 12 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | F16C = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
main: binding port with default address family
main: HTTP server is listening, hostname: 0.0.0.0, port: 8000, http threads: 11
main: loading model
srv load_model: loading model '/home/gym/llama.cpp/models/qwen7b/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf'
llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6600 XT) - 8136 MiB free
llama_model_loader: loaded meta data with 38 key-value pairs and 339 tensors from /home/gym/llama.cpp/models/qwen7b/Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Qwen2.5 Coder 7B Instruct
llama_model_loader: - kv 3: general.finetune str = Instruct
llama_model_loader: - kv 4: general.basename str = Qwen2.5-Coder
llama_model_loader: - kv 5: general.size_label str = 7B
llama_model_loader: - kv 6: general.license str = apache-2.0
llama_model_loader: - kv 7: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-C...
llama_model_loader: - kv 8: general.base_model.count u32 = 1
llama_model_loader: - kv 9: general.base_model.0.name str = Qwen2.5 Coder 7B
llama_model_loader: - kv 10: general.base_model.0.organization str = Qwen
llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-C...
llama_model_loader: - kv 12: general.tags arr[str,6] = ["code", "codeqwen", "chat", "qwen", ...
llama_model_loader: - kv 13: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 14: qwen2.block_count u32 = 28
llama_model_loader: - kv 15: qwen2.context_length u32 = 32768
llama_model_loader: - kv 16: qwen2.embedding_length u32 = 3584
llama_model_loader: - kv 17: qwen2.feed_forward_length u32 = 18944
llama_model_loader: - kv 18: qwen2.attention.head_count u32 = 28
llama_model_loader: - kv 19: qwen2.attention.head_count_kv u32 = 4
llama_model_loader: - kv 20: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 21: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 22: general.file_type u32 = 15
llama_model_loader: - kv 23: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 24: tokenizer.ggml.pre str = qwen2
llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 26: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 27: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 28: tokenizer.ggml.eos_token_id u32 = 151645
llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 151643
llama_model_loader: - kv 30: tokenizer.ggml.bos_token_id u32 = 151643
llama_model_loader: - kv 31: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 32: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
llama_model_loader: - kv 33: general.quantization_version u32 = 2
llama_model_loader: - kv 34: quantize.imatrix.file str = /models_out/Qwen2.5-Coder-7B-Instruct...
llama_model_loader: - kv 35: quantize.imatrix.dataset str = /training_dir/calibration_datav3.txt
llama_model_loader: - kv 36: quantize.imatrix.entries_count i32 = 196
llama_model_loader: - kv 37: quantize.imatrix.chunks_count i32 = 128
llama_model_loader: - type f32: 141 tensors
llama_model_loader: - type q4_K: 169 tensors
llama_model_loader: - type q6_K: 29 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q4_K - Medium
print_info: file size = 4.36 GiB (4.91 BPW)
load: printing all EOG tokens:
load: - 151643 ('<|endoftext|>')
load: - 151645 ('<|im_end|>')
load: - 151662 ('<|fim_pad|>')
load: - 151663 ('<|repo_name|>')
load: - 151664 ('<|file_sep|>')
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch = qwen2
print_info: vocab_only = 0
print_info: n_ctx_train = 32768
print_info: n_embd = 3584
print_info: n_layer = 28
print_info: n_head = 28
print_info: n_head_kv = 4
print_info: n_rot = 128
print_info: n_swa = 0
print_info: is_swa_any = 0
print_info: n_embd_head_k = 128
print_info: n_embd_head_v = 128
print_info: n_gqa = 7
print_info: n_embd_k_gqa = 512
print_info: n_embd_v_gqa = 512
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-06
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: f_attn_scale = 0.0e+00
print_info: n_ff = 18944
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = -1
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 32768
print_info: rope_finetuned = unknown
print_info: model type = 7B
print_info: model params = 7.62 B
print_info: general.name = Qwen2.5 Coder 7B Instruct
print_info: vocab type = BPE
print_info: n_vocab = 152064
print_info: n_merges = 151387
print_info: BOS token = 151643 '<|endoftext|>'
print_info: EOS token = 151645 '<|im_end|>'
print_info: EOT token = 151645 '<|im_end|>'
print_info: PAD token = 151643 '<|endoftext|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 151659 '<|fim_prefix|>'
print_info: FIM SUF token = 151661 '<|fim_suffix|>'
print_info: FIM MID token = 151660 '<|fim_middle|>'
print_info: FIM PAD token = 151662 '<|fim_pad|>'
print_info: FIM REP token = 151663 '<|repo_name|>'
print_info: FIM SEP token = 151664 '<|file_sep|>'
print_info: EOG token = 151643 '<|endoftext|>'
print_info: EOG token = 151645 '<|im_end|>'
print_info: EOG token = 151662 '<|fim_pad|>'
print_info: EOG token = 151663 '<|repo_name|>'
print_info: EOG token = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 1 repeating layers to GPU
load_tensors: offloaded 1/29 layers to GPU
load_tensors: ROCm0 model buffer size = 142.21 MiB
load_tensors: CPU_Mapped model buffer size = 4460.45 MiB
....................................................................................
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 32000
llama_context: n_ctx_per_seq = 32000
llama_context: n_batch = 2048
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = 1
llama_context: kv_unified = false
llama_context: freq_base = 1000000.0
llama_context: freq_scale = 1
llama_context: n_ctx_per_seq (32000) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.58 MiB
llama_kv_cache_unified: ROCm0 KV buffer size = 62.50 MiB
llama_kv_cache_unified: CPU KV buffer size = 1687.50 MiB
llama_kv_cache_unified: size = 1750.00 MiB ( 32000 cells, 28 layers, 1/1 seqs), K (f16): 875.00 MiB, V (f16): 875.00 MiB
llama_context: ROCm0 compute buffer size = 730.36 MiB
llama_context: ROCm_Host compute buffer size = 69.51 MiB
llama_context: graph nodes = 959
llama_context: graph splits = 382 (with bs=512), 3 (with bs=1)
common_init_from_params: added <|endoftext|> logit bias = -inf
common_init_from_params: added <|im_end|> logit bias = -inf
common_init_from_params: added <|fim_pad|> logit bias = -inf
common_init_from_params: added <|repo_name|> logit bias = -inf
common_init_from_params: added <|file_sep|> logit bias = -inf
common_init_from_params: setting dry_penalty_last_n to ctx_size = 32000
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
/home/gym/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:83: ROCm error
ggml_cuda_compute_forward: ADD failed
ROCm error: invalid device function
current device: 0, in function ggml_cuda_compute_forward at /home/gym/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:2504
err
[New LWP 33736]
[New LWP 33744]
[New LWP 33745]
[New LWP 33746]
[New LWP 33747]
[New LWP 33748]
[New LWP 33749]
[New LWP 33750]
[New LWP 33751]
[New LWP 33752]
[New LWP 33753]
[New LWP 33754]
[New LWP 33755]
[New LWP 33756]
[New LWP 33757]
[New LWP 33764]
[New LWP 33765]
[New LWP 33766]
[New LWP 33767]
[New LWP 33768]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
0x0000744403cea42f in __GI___wait4 (pid=33775, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
30 ../sysdeps/unix/sysv/linux/wait4.c: No such file or directory.
#0 0x0000744403cea42f in __GI___wait4 (pid=33775, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
30 in ../sysdeps/unix/sysv/linux/wait4.c
#1 0x000074440638b366 in ggml_print_backtrace () from /home/gym/llama.cpp/build/bin/libggml-base.so
#2 0x000074440638b5b9 in ggml_abort () from /home/gym/llama.cpp/build/bin/libggml-base.so
#3 0x00007444044f6fc2 in ggml_cuda_error(char const*, char const*, char const*, int, char const*) () from /home/gym/llama.cpp/build/bin/libggml-hip.so
#4 0x00007444044fdc63 in ggml_backend_cuda_graph_compute(ggml_backend*, ggml_cgraph*) () from /home/gym/llama.cpp/build/bin/libggml-hip.so
#5 0x00007444063a5fab in ggml_backend_sched_graph_compute_async () from /home/gym/llama.cpp/build/bin/libggml-base.so
#6 0x00007444064dbe81 in llama_context::graph_compute(ggml_cgraph*, bool) () from /home/gym/llama.cpp/build/bin/libllama.so
#7 0x00007444064dbb07 in llama_context::process_ubatch(llama_ubatch const&, llm_graph_type, llama_memory_context_i*, ggml_status&) () from /home/gym/llama.cpp/build/bin/libllama.so
#8 0x00007444064dcefe in llama_context::decode(llama_batch const&) () from /home/gym/llama.cpp/build/bin/libllama.so
#9 0x00007444064e0efb in llama_decode () from /home/gym/llama.cpp/build/bin/libllama.so
#10 0x0000000000472d9b in common_init_from_params(common_params&) ()
#11 0x00000000002c8ffe in server_context::load_model(common_params const&) ()
#12 0x000000000028ca89 in main ()
[Inferior 1 (process 33733) detached]
Aborted (core dumped)