Commit c9f3add

Authored by apicalshark, pminev, ykhrustalev, ggerganov, and leo-pony
Temp (#15)
* metal : fix minor string leaks (ggml/1004)

* cmake : make it possible linking ggml as external lib (ggml/1003)

* sync : ggml

* CANN: adjust backend registry refactor. (ggml-org#10158)

  remove buffer->iface.get_name that used in cann as it was removed in backend registry refactor PR.

* metal : move dequantize templates to beginning of MSL source (#0)

* metal : simplify f16 and f32 dequant kernels (#0)

* cuda : clear error after changing peer access (ggml-org#10153)

* fix build break on arm64 linux (ggml-org#10166)

  This fixes the build break from the recent changes to move the CPU backend to separate files ggml-org#10144

* server : clarify /slots endpoint, add is_processing (ggml-org#10162)

  * server : clarify /slots endpoint, add is_processing
  * fix tests

* ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (ggml-org#10167)

* ggml : fix gelu tables initialization (ggml-org#10172)

* Q6_K AVX improvements (ggml-org#10118)

  * q6_k instruction reordering attempt
  * better subtract method
  * should be theoretically faster

    small improvement with shuffle lut, likely because all loads are already done at that stage

  * optimize bit fiddling
  * handle -32 offset separately. bsums exists for a reason!
  * use shift
  * Update ggml-quants.c
  * have to update ci macos version to 13 as 12 doesnt work now. 13 is still x86

* ggml : fix arch check in bf16_to_fp32 (ggml-org#10164)

* llama : add <|tool_call|> formatting to Granite template (ggml-org#10177)

  Branch: GraniteToolCallTemplate
  Signed-off-by: Gabe Goodhart <[email protected]>

* metal : add quantized FA support (ggml-org#10149)

  * metal : add quantized FA (vec) support ggml-ci
  * metal : add quantized FA (non-vec) support
  * metal : fix support check ggml-ci
  * metal : clean-up
  * metal : clean-up (cont)
  * metal : fix shared memory calc + reduce smem + comments
  * metal : float-correctness
  * metal : minor [no ci]

* ggml : adjust is_first_call init value (ggml-org#10193) ggml-ci

* metal : fix from ptr buffer name (ggml-org#10189)

* server : remove hack for extra parallel slot (ggml-org#10187) ggml-ci

* metal : add BF16 support (ggml-org#8439)

  * ggml : add initial BF16 support ggml-ci
  * metal : add mul_mat_id BF16 support ggml-ci
  * metal : check for bfloat support on the Metal device ggml-ci
  * metal : better var names [no ci]
  * metal : do not build bfloat kernels when not supported ggml-ci
  * metal : try to fix BF16 support check ggml-ci
  * metal : this should correctly check bfloat support

---------

Signed-off-by: Gabe Goodhart <[email protected]>
Co-authored-by: Plamen Minev <[email protected]>
Co-authored-by: Yuri Khrustalev <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: leo-pony <[email protected]>
Co-authored-by: Diego Devesa <[email protected]>
Co-authored-by: snadampal <[email protected]>
Co-authored-by: Xuan Son Nguyen <[email protected]>
Co-authored-by: Eve <[email protected]>
Co-authored-by: Gabe Goodhart <[email protected]>
1 parent f035dba commit c9f3add

16 files changed (+1877 / -1367 lines)


.github/workflows/build.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -63,6 +63,7 @@ env:
 
 jobs:
 
+
   # CUDA Release
 
   ubuntu-latest-cmake:
```

common/common.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
```
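
For orientation, here is a minimal standalone sketch (not the actual common.cpp code) of how the extended mapping behaves once the bf16 branch is in place. It assumes only that `ggml.h` is on the include path; the helper name is illustrative:

```cpp
// Sketch: map a KV-cache type string to a ggml type, mirroring the behaviour
// added above. Standalone illustration, not the common.cpp function itself.
#include <cstdio>
#include <stdexcept>
#include <string>

#include "ggml.h"

static ggml_type kv_cache_type_from_str_sketch(const std::string & s) {
    if (s == "f16")  { return GGML_TYPE_F16;  }
    if (s == "bf16") { return GGML_TYPE_BF16; } // new in this commit
    if (s == "q8_0") { return GGML_TYPE_Q8_0; }
    throw std::runtime_error("unsupported cache type: " + s);
}

int main() {
    const char * names[] = { "f16", "bf16", "q8_0" };
    for (const char * name : names) {
        printf("%-5s -> ggml type %d\n", name, (int) kv_cache_type_from_str_sketch(name));
    }
    return 0;
}
```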

examples/server/README.md

Lines changed: 5 additions & 6 deletions
````diff
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicted
 
 ### GET `/slots`: Returns the current slots processing state
 
-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`
 
 If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
 
@@ -709,6 +712,7 @@ Example:
     "grammar": "",
     "id": 0,
     "ignore_eos": false,
+    "is_processing": false,
     "logit_bias": [],
     "min_p": 0.05000000074505806,
     "mirostat": 0,
@@ -741,7 +745,6 @@ Example:
       "temperature"
     ],
     "seed": 42,
-    "state": 1,
     "stop": [
       "\n"
     ],
@@ -755,10 +758,6 @@ Example:
   ]
 ```
 
-Possible values for `slot[i].state` are:
-- `0`: SLOT_STATE_IDLE
-- `1`: SLOT_STATE_PROCESSING
-
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.
````
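
To illustrate the new field, a small client sketch (assuming a server started with `--slots` on localhost:8080, and that the `cpp-httplib` and nlohmann::json headers bundled with the repo are available) that reads the boolean `is_processing` instead of the removed numeric `state`:

```cpp
// Sketch: query GET /slots and report per-slot busy/idle using the new
// boolean "is_processing" field. Assumes the server was started with --slots.
#include <cstdio>

#include "httplib.h"   // cpp-httplib, bundled with the server example
#include "json.hpp"    // nlohmann::json single header, bundled under common/

int main() {
    httplib::Client cli("localhost", 8080);

    auto res = cli.Get("/slots");
    if (!res || res->status != 200) {
        fprintf(stderr, "slots endpoint unavailable (is the server running with --slots?)\n");
        return 1;
    }

    const auto slots = nlohmann::json::parse(res->body);
    for (const auto & slot : slots) {
        printf("slot %d: %s\n",
               slot.at("id").get<int>(),
               slot.at("is_processing").get<bool>() ? "processing" : "idle");
    }
    return 0;
}
```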

examples/server/server.cpp

Lines changed: 32 additions & 37 deletions
```diff
@@ -378,8 +378,8 @@ struct server_queue {
     std::condition_variable condition_tasks;
 
     // callback functions
-    std::function<void(server_task&)> callback_new_task;
-    std::function<void(void)> callback_update_slots;
+    std::function<void(server_task)> callback_new_task;
+    std::function<void(void)> callback_update_slots;
 
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
@@ -431,7 +431,7 @@ struct server_queue {
     }
 
     // Register function to process a new task
-    void on_new_task(std::function<void(server_task &)> callback) {
+    void on_new_task(std::function<void(server_task)> callback) {
         callback_new_task = std::move(callback);
     }
 
@@ -481,7 +481,7 @@ struct server_queue {
            lock.unlock();
 
            QUE_DBG("processing task, id = %d\n", task.id);
-            callback_new_task(task);
+            callback_new_task(std::move(task));
        }
 
        // all tasks in the current loop is processed, slots data is now ready
@@ -644,17 +644,12 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // reserve one extra sequence (seq_id == 0) for extra features
-        params.n_parallel += 1;
-
         common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx = llama_init.context;
         loras = llama_init.lora_adapters;
 
-        params.n_parallel -= 1; // but be sneaky about it
-
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
             return false;
@@ -1288,16 +1283,16 @@ struct server_context {
 
     void send_embedding(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.id = slot.id_task;
+        res.error = false;
+        res.stop = true;
 
         const int n_embd = llama_n_embd(model);
 
         std::vector<float> embd_res(n_embd, 0.0f);
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                continue;
            }
 
@@ -1332,12 +1327,12 @@ struct server_context {
 
     void send_rerank(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id = slot.id_task;
-        res.error = false;
-        res.stop = true;
+        res.id = slot.id_task;
+        res.error = false;
+        res.stop = true;
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                continue;
            }
 
@@ -1510,7 +1505,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(const server_task & task) {
+    void process_single_task(server_task task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_INFERENCE:
                {
@@ -1566,11 +1561,11 @@ struct server_context {
 
            for (server_slot & slot : slots) {
                json slot_data = get_formated_generation(slot);
-                slot_data["id"] = slot.id;
-                slot_data["id_task"] = slot.id_task;
-                slot_data["state"] = slot.state;
-                slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
-                slot_data["next_token"] = {
+                slot_data["id"] = slot.id;
+                slot_data["id_task"] = slot.id_task;
+                slot_data["is_processing"] = slot.is_processing();
+                slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
+                slot_data["next_token"] = {
                    {"has_next_token", slot.has_next_token},
                    {"has_new_line", slot.has_new_line},
                    {"n_remain", slot.n_remaining},
@@ -1581,10 +1576,10 @@ struct server_context {
                    {"stopping_word", slot.stopping_word},
                };
 
-                if (slot_data["state"] == SLOT_STATE_IDLE) {
-                    n_idle_slots++;
-                } else {
+                if (slot.is_processing()) {
                    n_processing_slots++;
+                } else {
+                    n_idle_slots++;
                }
 
                slots_data.push_back(slot_data);
@@ -1646,7 +1641,7 @@ struct server_context {
            std::string filename = task.data.at("filename");
            std::string filepath = task.data.at("filepath");
 
-            const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+            const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
 
            const int64_t t_end = ggml_time_us();
            const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -1688,7 +1683,7 @@ struct server_context {
 
            slot->cache_tokens.resize(slot->n_ctx);
            size_t token_count = 0;
-            size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+            size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
            if (nread == 0) {
                slot->cache_tokens.resize(0);
                send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
@@ -1731,7 +1726,7 @@ struct server_context {
 
            // Erase token cache
            const size_t n_erased = slot->cache_tokens.size();
-            llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+            llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
            slot->cache_tokens.clear();
 
            server_task_result result;
@@ -1808,8 +1803,8 @@ struct server_context {
 
        SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-        llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep, n_keep + n_discard);
-        llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
+        llama_kv_cache_seq_rm (ctx, slot.id, n_keep, n_keep + n_discard);
+        llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
        if (slot.params.cache_prompt) {
            for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1836,7 +1831,7 @@ struct server_context {
 
        slot.i_batch = batch.n_tokens;
 
-        common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
+        common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
        slot.n_past += 1;
 
@@ -1983,8 +1978,8 @@ struct server_context {
 
                const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
+                llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
 
                for (size_t i = 0; i < n_match; i++) {
                    slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -2033,9 +2028,9 @@ struct server_context {
        }
 
        // keep only the common part
-        if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+        if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
            // could not partially delete (likely using a non-Transformer model)
-            llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+            llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
            // there is no common part left
            slot.n_past = 0;
@@ -2048,7 +2043,7 @@ struct server_context {
 
        // add prompt tokens for processing in the current batch
        while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-            common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+            common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
 
            if (slot.params.cache_prompt) {
                slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
```
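
Two themes run through this diff: slot sequence ids now equal `slot.id` because the reserved extra sequence was dropped, and the task callbacks now take `server_task` by value so the queue can move the task into the handler instead of handing out a reference. A generic sketch of that second idiom, with hypothetical `Task`/`TaskQueue` names rather than the server's real types:

```cpp
// Sketch of the pass-by-value + std::move callback idiom used above.
// Hypothetical Task/TaskQueue types for illustration only.
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

struct Task {
    int id;
    std::vector<int> prompt_tokens;   // potentially large payload
};

struct TaskQueue {
    std::function<void(Task)> callback_new_task;   // by value, not Task&

    void on_new_task(std::function<void(Task)> callback) {
        callback_new_task = std::move(callback);
    }

    void post(Task task) {
        // The queue is done with the task: move it into the handler so the
        // payload is transferred rather than referenced or copied.
        callback_new_task(std::move(task));
    }
};

int main() {
    TaskQueue queue;
    queue.on_new_task([](Task task) {                 // receives ownership
        printf("processing task %d with %zu tokens\n",
               task.id, task.prompt_tokens.size());
    });

    queue.post({42, std::vector<int>(1024, 0)});
    return 0;
}
```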

examples/server/tests/features/steps/steps.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
     match expected_slot_status_string:
         case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
         case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
         case _:
             assert False, "unknown status"
 
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                       for slot_id in range(context.n_slots)]
     await request_slots_status(context, expected_slots)
 
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
         if status_code == 503 and status_code == expected_http_status_code:
             return
         if status_code == 200 and status_code == expected_http_status_code:
-            n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
-            n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+            n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
+            n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
             if ((slots_idle is None or slots_idle == n_slots_idle)
                     and (slots_processing is None or slots_processing == n_slots_processing)):
                 return
```

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1406,7 +1406,7 @@ if (EMSCRIPTEN)
 endif()
 
 target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC ../include)
+target_include_directories(ggml PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
 target_link_directories   (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump
```

ggml/src/ggml-cann.cpp

Lines changed: 0 additions & 1 deletion
```diff
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
 
     return buffer;
```

ggml/src/ggml-cpu.c

Lines changed: 12 additions & 7 deletions
```diff
@@ -304,6 +304,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_Q8_0] = {
+        .from_float_to_mat = quantize_mat_q8_0,
         .vec_dot = ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -13677,31 +13678,35 @@ int ggml_cpu_get_sve_cnt(void) {
 }
 
 void ggml_cpu_init(void) {
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
     ggml_critical_section_start();
 
     static bool is_first_call = true;
 
     if (is_first_call) {
         // initialize GELU, Quick GELU, SILU and EXP F32 tables
         {
-            // FIXME: this may be called before ggml_init
-            //const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             for (int i = 0; i < (1 << 16); ++i) {
                 union {
                     uint16_t u16;
                     ggml_fp16_t fp16;
                 } u = {i};
-                // FIXME: this table is used in conversion functions outside of compute
-                // current code depends on ggml_init initializing this table
-                float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+                float f = GGML_FP16_TO_FP32(u.fp16);
                 ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             }
 
-            //const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
-            //GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
         }
 
 #if defined(__ARM_ARCH)
```
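
The change above forces a throw-away `ggml_init` so the f16 conversion table exists before the GELU and Quick GELU tables are derived from it. As a generic illustration of the same build-the-table-exactly-once idea (illustrative names, `std::call_once` instead of ggml's critical section, and a coarse grid instead of all 65536 fp16 values):

```cpp
// Sketch: build a lookup table exactly once before first use, the same
// intent as the ggml_cpu_init change above. Illustrative names only.
#include <cmath>
#include <cstdio>
#include <mutex>
#include <vector>

static std::vector<float> gelu_table;
static std::once_flag     gelu_table_once;

// tanh-based GELU approximation (the form ggml's f32 reference uses)
static float gelu_f32(float x) {
    return 0.5f * x * (1.0f + tanhf(0.7978845608028654f * (x + 0.044715f * x * x * x)));
}

static void init_gelu_table() {
    // coarse grid over [-8, 8); the real code tabulates every fp16 input
    gelu_table.resize(1024);
    for (size_t i = 0; i < gelu_table.size(); ++i) {
        const float x = -8.0f + 16.0f * (float) i / (float) gelu_table.size();
        gelu_table[i] = gelu_f32(x);
    }
}

float gelu_lookup(float x) {
    std::call_once(gelu_table_once, init_gelu_table);   // safe from any thread
    if (x <= -8.0f) return 0.0f;
    if (x >=  8.0f) return x;
    const size_t i = (size_t) ((x + 8.0f) / 16.0f * (float) gelu_table.size());
    return gelu_table[i];
}

int main() {
    printf("gelu(1.0) ~= %f\n", gelu_lookup(1.0f));
    return 0;
}
```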

ggml/src/ggml-cuda.cu

Lines changed: 6 additions & 0 deletions
```diff
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                 cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                 if (err != cudaErrorPeerAccessAlreadyEnabled) {
                     CUDA_CHECK(err);
+                } else {
+                    // reset the error
+                    cudaGetLastError();
                 }
             } else {
                 cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                 if (err != cudaErrorPeerAccessNotEnabled) {
                     CUDA_CHECK(err);
+                } else {
+                    // reset the error
+                    cudaGetLastError();
                 }
             }
         }
```
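
The CUDA fix addresses a sticky-error issue: `cudaDeviceEnablePeerAccess` returning `cudaErrorPeerAccessAlreadyEnabled` is harmless, but the error stays latched until `cudaGetLastError()` clears it, so a later unrelated `CUDA_CHECK` would report it. A self-contained sketch of the idiom (helper name is illustrative, not the ggml-cuda code):

```cpp
// Sketch: treat "peer access already enabled" as success and clear CUDA's
// sticky last-error so later error checks don't pick it up.
#include <cstdio>
#include <cuda_runtime.h>

static bool enable_peer_access_once(int device, int peer_device) {
    cudaSetDevice(device);

    const cudaError_t err = cudaDeviceEnablePeerAccess(peer_device, 0);
    if (err == cudaSuccess) {
        return true;
    }
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // benign: access was enabled earlier; clear the sticky error state
        cudaGetLastError();
        return true;
    }
    fprintf(stderr, "peer access %d -> %d failed: %s\n",
            device, peer_device, cudaGetErrorString(err));
    return false;
}

int main() {
    int n_devices = 0;
    cudaGetDeviceCount(&n_devices);

    for (int i = 0; i < n_devices; ++i) {
        for (int j = 0; j < n_devices; ++j) {
            if (i == j) continue;
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, i, j);
            if (can_access) {
                enable_peer_access_once(i, j);
            }
        }
    }
    return 0;
}
```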
