@@ -302,12 +302,26 @@ static void ggml_backend_copy_cache_ptrs(char **& backend_cache_ptrs, const char
     cudaMemcpy(backend_cache_ptrs, host_cache_ptrs, size*sizeof(char *), cudaMemcpyHostToDevice);
 }
 
-void ggml_backend_copy_k_cache_ptrs(const char ** host_cache_ptrs, size_t size) {
-    ggml_backend_copy_cache_ptrs(k_cache_ptrs, host_cache_ptrs, size);
-}
-
-void ggml_backend_copy_v_cache_ptrs(const char ** host_cache_ptrs, size_t size) {
-    ggml_backend_copy_cache_ptrs(v_cache_ptrs, host_cache_ptrs, size);
+void ggml_backend_copy_kv_cache_ptrs(const int64_t n_layer, const int64_t kv_head, struct ggml_tensor ** kv_kl, struct ggml_tensor ** kv_vl, const int64_t n_embd_k_gqa, const int64_t n_embd_v_gqa, const bool flash_attn) {
+
+    std::vector<const char *> host_k_cache_ptrs;
+    std::vector<const char *> host_v_cache_ptrs;
+    for (int il = 0; il < n_layer; ++il) {
+        // K cache pointer for this layer
+        ggml_tensor * tmp_tensor = kv_kl[il];
+        size_t tmp_offset = ggml_row_size(kv_kl[il]->type, n_embd_k_gqa)*kv_head;
+        host_k_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+        // V cache pointer for this layer
+        tmp_tensor = kv_vl[il];
+        if (flash_attn) {
+            tmp_offset = kv_head*ggml_row_size(kv_vl[il]->type, n_embd_v_gqa);
+        } else {
+            tmp_offset = kv_head*ggml_element_size(kv_vl[il]);
+        }
+        host_v_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+    }
+    ggml_backend_copy_cache_ptrs(k_cache_ptrs, host_k_cache_ptrs.data(), host_k_cache_ptrs.size());
+    ggml_backend_copy_cache_ptrs(v_cache_ptrs, host_v_cache_ptrs.data(), host_v_cache_ptrs.size());
 }
 
 static void ggml_cpy_f16_f32_cuda(
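Note: the new helper reuses the existing ggml_backend_copy_cache_ptrs above, which pushes a host-side array of per-layer device pointers into a device-side pointer array with a single cudaMemcpy. A minimal standalone sketch of that pattern is shown below; it is not part of the patch, and the names (n_layer, host_ptrs, dev_ptrs) and buffer sizes are illustrative only.

// Illustrative sketch: copy an array of per-layer device pointers to the device,
// mirroring the cudaMemcpy(..., cudaMemcpyHostToDevice) call in ggml_backend_copy_cache_ptrs.
#include <cuda_runtime.h>
#include <vector>
#include <cstdio>

int main() {
    const int n_layer = 4;

    // Allocate one small device buffer per "layer" and record its device pointer on the host.
    std::vector<char *> host_ptrs(n_layer);
    for (int il = 0; il < n_layer; ++il) {
        cudaMalloc(&host_ptrs[il], 256);
    }

    // Device-side array of pointers, filled in one host-to-device copy.
    char ** dev_ptrs = nullptr;
    cudaMalloc(&dev_ptrs, n_layer*sizeof(char *));
    cudaError_t err = cudaMemcpy(dev_ptrs, host_ptrs.data(),
                                 n_layer*sizeof(char *), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // A kernel could now index dev_ptrs[il] to reach each layer's buffer directly.

    for (int il = 0; il < n_layer; ++il) {
        cudaFree(host_ptrs[il]);
    }
    cudaFree(dev_ptrs);
    return 0;
}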