Temp #15

Merged 20 commits on Nov 7, 2024

Commits
e2292aa
metal : fix minor string leaks (ggml/1004)
pminev Nov 1, 2024
284e5b0
cmake : make it possible linking ggml as external lib (ggml/1003)
ykhrustalev Nov 2, 2024
ce027ad
sync : ggml
ggerganov Nov 4, 2024
329ed91
CANN: adjust backend registry refactor. (#10158)
leo-pony Nov 4, 2024
f8e5813
metal : move dequantize templates to beginning of MSL source (#0)
ggerganov Nov 4, 2024
05697f6
metal : simplify f16 and f32 dequant kernels (#0)
ggerganov Nov 4, 2024
ea02c75
cuda : clear error after changing peer access (#10153)
slaren Nov 4, 2024
6a066b9
fix build break on arm64 linux (#10166)
snadampal Nov 4, 2024
9e0ecfb
server : clarify /slots endpoint, add is_processing (#10162)
ngxson Nov 4, 2024
401558b
ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (#10167)
slaren Nov 4, 2024
d5a409e
ggml : fix gelu tables initialization (#10172)
slaren Nov 4, 2024
3407364
Q6_K AVX improvements (#10118)
netrunnereve Nov 4, 2024
a9e8a9a
ggml : fix arch check in bf16_to_fp32 (#10164)
slaren Nov 4, 2024
b8deef0
llama : add <|tool_call|> formatting to Granite template (#10177)
gabe-l-hart Nov 5, 2024
a1eaf6a
metal : add quantized FA support (#10149)
ggerganov Nov 6, 2024
1dc04b2
ggml : adjust is_first_call init value (#10193)
ggerganov Nov 6, 2024
94d8cb8
metal : fix from ptr buffer name (#10189)
slaren Nov 6, 2024
b11f9ba
server : remove hack for extra parallel slot (#10187)
ggerganov Nov 6, 2024
5c333e0
metal : add BF16 support (#8439)
ggerganov Nov 6, 2024
109df6a
Merge branch 'master' into temp
apicalshark Nov 7, 2024
1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -63,6 +63,7 @@ env:

jobs:


# CUDA Release

ubuntu-latest-cmake:
3 changes: 3 additions & 0 deletions common/common.cpp
@@ -1003,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
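
For reference, the hunk above shows only the middle of `kv_cache_type_from_str`. A minimal sketch of the mapping after this change (the upstream function in `common/common.cpp` also handles further quantized cache types not shown here):

```cpp
// Sketch of the string-to-type mapping with the new "bf16" case; the real
// function in common/common.cpp supports additional quantized KV-cache types.
#include <cstdio>
#include <stdexcept>
#include <string>

#include "ggml.h"

static ggml_type kv_cache_type_from_str(const std::string & s) {
    if (s == "f16")  { return GGML_TYPE_F16;  }
    if (s == "bf16") { return GGML_TYPE_BF16; }  // added by this PR
    if (s == "q8_0") { return GGML_TYPE_Q8_0; }

    throw std::runtime_error("Unsupported KV cache type: " + s);
}

int main() {
    // e.g. a "--cache-type-k bf16" style option would reach the new branch
    printf("bf16 maps to ggml_type %d\n", (int) kv_cache_type_from_str("bf16"));
    return 0;
}
```

Together with the Metal BF16 support from commit 5c333e0, this makes a bf16 KV cache selectable through the usual cache-type options (assuming the `--cache-type-k`/`--cache-type-v` flag names in the current build).
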
11 changes: 5 additions & 6 deletions examples/server/README.md
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte

### GET `/slots`: Returns the current slots processing state

This endpoint can be disabled with `--no-slots`
> [!WARNING]
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.

This endpoint is disabled by default and can be enabled with `--slots`

If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.

@@ -709,6 +712,7 @@ Example:
"grammar": "",
"id": 0,
"ignore_eos": false,
"is_processing": false,
"logit_bias": [],
"min_p": 0.05000000074505806,
"mirostat": 0,
@@ -741,7 +745,6 @@ Example:
"temperature"
],
"seed": 42,
"state": 1,
"stop": [
"\n"
],
@@ -755,10 +758,6 @@ Example:
]
```

Possible values for `slot[i].state` are:
- `0`: SLOT_STATE_IDLE
- `1`: SLOT_STATE_PROCESSING

### GET `/metrics`: Prometheus compatible metrics exporter

This endpoint is only accessible if `--metrics` is set.
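
As a usage sketch for the endpoint changes documented above: the snippet below polls `GET /slots` and reads the new `is_processing` flag. It assumes a local `llama-server` started with `--slots` on the default port 8080 and uses the cpp-httplib and nlohmann/json headers that the server example already bundles; it is an illustration, not part of this PR.

```cpp
// Illustration only: query the /slots endpoint and print each slot's state
// using the new is_processing flag. Assumes the server example's bundled
// httplib.h and json.hpp headers and a server started with --slots.
#include <cstdio>

#include "httplib.h"
#include "json.hpp"

int main() {
    httplib::Client cli("http://localhost:8080");

    auto res = cli.Get("/slots?fail_on_no_slot=1");
    if (!res) {
        fprintf(stderr, "request failed\n");
        return 1;
    }
    if (res->status == 503) {
        printf("all slots are busy\n");
        return 0;
    }

    const auto slots = nlohmann::json::parse(res->body);
    for (const auto & slot : slots) {
        printf("slot %d: %s\n",
               slot.at("id").get<int>(),
               slot.at("is_processing").get<bool>() ? "processing" : "idle");
    }
    return 0;
}
```

With `?fail_on_no_slot=1` the endpoint answers 503 when every slot is busy, and the boolean `is_processing` replaces the removed numeric `state` field (`SLOT_STATE_IDLE`/`SLOT_STATE_PROCESSING`).
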
69 changes: 32 additions & 37 deletions examples/server/server.cpp
@@ -378,8 +378,8 @@ struct server_queue {
std::condition_variable condition_tasks;

// callback functions
std::function<void(server_task&)> callback_new_task;
std::function<void(void)> callback_update_slots;
std::function<void(server_task)> callback_new_task;
std::function<void(void)> callback_update_slots;

// Add a new task to the end of the queue
int post(server_task task, bool front = false) {
@@ -431,7 +431,7 @@ struct server_queue {
}

// Register function to process a new task
void on_new_task(std::function<void(server_task &)> callback) {
void on_new_task(std::function<void(server_task)> callback) {
callback_new_task = std::move(callback);
}

@@ -481,7 +481,7 @@ struct server_queue {
lock.unlock();

QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(task);
callback_new_task(std::move(task));
}

// all tasks in the current loop is processed, slots data is now ready
@@ -644,17 +644,12 @@ struct server_context {
bool load_model(const common_params & params_) {
params = params_;

// reserve one extra sequence (seq_id == 0) for extra features
params.n_parallel += 1;

common_init_result llama_init = common_init_from_params(params);

model = llama_init.model;
ctx = llama_init.context;
loras = llama_init.lora_adapters;

params.n_parallel -= 1; // but be sneaky about it

if (model == nullptr) {
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
return false;
@@ -1288,16 +1283,16 @@ struct server_context {

void send_embedding(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.id = slot.id_task;
res.error = false;
res.stop = true;

const int n_embd = llama_n_embd(model);

std::vector<float> embd_res(n_embd, 0.0f);

for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}

@@ -1332,12 +1327,12 @@ struct server_context {

void send_rerank(const server_slot & slot, const llama_batch & batch) {
server_task_result res;
res.id = slot.id_task;
res.error = false;
res.stop = true;
res.id = slot.id_task;
res.error = false;
res.stop = true;

for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}

@@ -1510,7 +1505,7 @@ struct server_context {
// Functions to process the task
//

void process_single_task(const server_task & task) {
void process_single_task(server_task task) {
switch (task.type) {
case SERVER_TASK_TYPE_INFERENCE:
{
@@ -1566,11 +1561,11 @@ struct server_context {

for (server_slot & slot : slots) {
json slot_data = get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["id_task"] = slot.id_task;
slot_data["state"] = slot.state;
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
slot_data["next_token"] = {
slot_data["id"] = slot.id;
slot_data["id_task"] = slot.id_task;
slot_data["is_processing"] = slot.is_processing();
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"has_new_line", slot.has_new_line},
{"n_remain", slot.n_remaining},
@@ -1581,10 +1576,10 @@ struct server_context {
{"stopping_word", slot.stopping_word},
};

if (slot_data["state"] == SLOT_STATE_IDLE) {
n_idle_slots++;
} else {
if (slot.is_processing()) {
n_processing_slots++;
} else {
n_idle_slots++;
}

slots_data.push_back(slot_data);
@@ -1646,7 +1641,7 @@ struct server_context {
std::string filename = task.data.at("filename");
std::string filepath = task.data.at("filepath");

const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);

const int64_t t_end = ggml_time_us();
const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -1688,7 +1683,7 @@ struct server_context {

slot->cache_tokens.resize(slot->n_ctx);
size_t token_count = 0;
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
if (nread == 0) {
slot->cache_tokens.resize(0);
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
@@ -1731,7 +1726,7 @@ struct server_context {

// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
slot->cache_tokens.clear();

server_task_result result;
@@ -1808,8 +1803,8 @@ struct server_context {

SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1836,7 +1831,7 @@ struct server_context {

slot.i_batch = batch.n_tokens;

common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);

slot.n_past += 1;

@@ -1983,8 +1978,8 @@ struct server_context {

const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);

for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -2033,9 +2028,9 @@ struct server_context {
}

// keep only the common part
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
// could not partially delete (likely using a non-Transformer model)
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);

// there is no common part left
slot.n_past = 0;
@@ -2048,7 +2043,7 @@ struct server_context {

// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);

if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
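
One easy-to-miss change at the top of this file's diff is the `server_queue` callback signature: `callback_new_task` now takes the `server_task` by value and the queue passes it with `std::move`, so the handler can take ownership of the task instead of working through a mutable reference. A standalone sketch of that pattern (hypothetical `task` type and queue, not the server's actual classes):

```cpp
// Standalone sketch of the new callback convention: the queue passes the task
// by value with std::move, so the handler owns it. `task` is a stand-in type,
// not the server's server_task.
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct task {
    int         id;
    std::string data;
};

int main() {
    // equivalent of on_new_task(): the callback receives the task by value
    std::function<void(task)> callback_new_task = [](task t) {
        // the handler owns t and may move its contents onward
        std::cout << "processing task " << t.id << ": " << t.data << "\n";
    };

    std::vector<task> queue_tasks = { {1, "tokenize"}, {2, "decode"} };

    // equivalent of the queue loop: pop the task, then move it into the callback
    while (!queue_tasks.empty()) {
        task t = std::move(queue_tasks.front());
        queue_tasks.erase(queue_tasks.begin());

        callback_new_task(std::move(t));
    }
    return 0;
}
```

The remaining server.cpp changes follow from dropping the reserved extra sequence in `load_model`: slot ids are now used directly as KV-cache sequence ids, hence the repeated `slot.id + 1` to `slot.id` edits throughout this file.
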
10 changes: 5 additions & 5 deletions examples/server/tests/features/steps/steps.py
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
match expected_slot_status_string:
case 'idle':
expected_slot_status = 0
expected_slot_status = False
case 'busy':
expected_slot_status = 1
expected_slot_status = True
case _:
assert False, "unknown status"

expected_slots = [{'id': slot_id, 'state': expected_slot_status}
expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
for slot_id in range(context.n_slots)]
await request_slots_status(context, expected_slots)

@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
if status_code == 503 and status_code == expected_http_status_code:
return
if status_code == 200 and status_code == expected_http_status_code:
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
if ((slots_idle is None or slots_idle == n_slots_idle)
and (slots_processing is None or slots_processing == n_slots_processing)):
return
2 changes: 1 addition & 1 deletion ggml/src/CMakeLists.txt
@@ -1406,7 +1406,7 @@ if (EMSCRIPTEN)
endif()

target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include)
target_include_directories(ggml PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
target_compile_features (ggml PRIVATE c_std_11) # don't bump
1 change: 0 additions & 1 deletion ggml/src/ggml-cann.cpp
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm

ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;

return buffer;
19 changes: 12 additions & 7 deletions ggml/src/ggml-cpu.c
@@ -304,6 +304,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.nrows = 1,
},
[GGML_TYPE_Q8_0] = {
.from_float_to_mat = quantize_mat_q8_0,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -13677,31 +13678,35 @@ int ggml_cpu_get_sve_cnt(void) {
}

void ggml_cpu_init(void) {
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}

ggml_critical_section_start();

static bool is_first_call = true;

if (is_first_call) {
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
// FIXME: this may be called before ggml_init
//const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

for (int i = 0; i < (1 << 16); ++i) {
union {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
// FIXME: this table is used in conversion functions outside of compute
// current code depends on ggml_init initializing this table
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
float f = GGML_FP16_TO_FP32(u.fp16);
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
}

//const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

//GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
}

#if defined(__ARM_ARCH)
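
The `ggml_cpu_init` rework above keeps the same underlying idea: a `ggml_fp16_t` has only 65536 possible bit patterns, so activation tables can be filled once by sweeping every pattern. The standalone sketch below demonstrates that technique; unlike the real code it decodes IEEE binary16 by hand and stores plain `float` entries, whereas ggml converts through its own FP16 helpers and stores the GELU tables back as f16.

```cpp
// Standalone sketch of the "enumerate all 65536 fp16 bit patterns" trick used
// to precompute activation tables in ggml_cpu_init. Not ggml code: it decodes
// IEEE binary16 by hand and keeps the table in f32 for simplicity.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static float fp16_bits_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1F;
    uint32_t       mant = h & 0x3FF;

    uint32_t bits;
    if (exp == 0x1F) {                       // inf / NaN
        bits = sign | 0x7F800000u | (mant << 13);
    } else if (exp != 0) {                   // normal number
        bits = sign | ((exp + 112u) << 23) | (mant << 13);
    } else if (mant == 0) {                  // +/- zero
        bits = sign;
    } else {                                 // subnormal: renormalize
        int e = -1;
        do { mant <<= 1; ++e; } while ((mant & 0x400) == 0);
        bits = sign | ((uint32_t)(112 - e) << 23) | ((mant & 0x3FF) << 13);
    }

    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// tanh-based GELU approximation, matching the usual f32 reference formula
static float gelu_f32(float x) {
    const float c = 0.797884560802865f; // sqrt(2/pi)
    return 0.5f*x*(1.0f + std::tanh(c*(x + 0.044715f*x*x*x)));
}

static float table_gelu[1 << 16];

int main() {
    // one pass over every possible fp16 value fills the whole table
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_gelu[i] = gelu_f32(fp16_bits_to_fp32((uint16_t) i));
    }

    // afterwards, gelu(x) for an fp16 x is a lookup by its bit pattern
    printf("gelu(1.0)  ~ %f\n", table_gelu[0x3C00]);  // 0x3C00 is 1.0 in fp16
    printf("gelu(-2.0) ~ %f\n", table_gelu[0xC000]);  // 0xC000 is -2.0 in fp16
    return 0;
}
```

The fix in the diff is about ordering: the f16 conversion table is now guaranteed to exist (via a throwaway `ggml_init`) before the GELU tables are derived from it.
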
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda.cu
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
}
}
}
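
The `cudaGetLastError()` calls added above deal with a CUDA runtime quirk: `cudaDeviceEnablePeerAccess` returning `cudaErrorPeerAccessAlreadyEnabled` (or the disable path returning `cudaErrorPeerAccessNotEnabled`) is harmless, but the error code stays latched in the runtime's sticky error state until something reads it, so a later `cudaGetLastError()` check, for example after a kernel launch, would report it. A standalone sketch of the same pattern (hypothetical two-GPU setup, host-side CUDA runtime API only):

```cpp
// Standalone sketch of the pattern added in the diff above: treat
// cudaErrorPeerAccessAlreadyEnabled as benign, but still clear the sticky
// error state so later error checks do not pick it up. Assumes >= 2 GPUs.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK_CUDA(expr)                                              \
    do {                                                              \
        cudaError_t err_ = (expr);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            std::exit(1);                                             \
        }                                                             \
    } while (0)

int main() {
    int n_devices = 0;
    CHECK_CUDA(cudaGetDeviceCount(&n_devices));
    if (n_devices < 2) {
        printf("need at least two GPUs to demonstrate peer access\n");
        return 0;
    }

    CHECK_CUDA(cudaSetDevice(0));

    cudaError_t err = cudaDeviceEnablePeerAccess(/*peerDevice=*/1, /*flags=*/0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // benign, but the error code is sticky: read it once so the next
        // cudaGetLastError() (e.g. after a kernel launch) stays clean
        cudaGetLastError();
    } else {
        CHECK_CUDA(err);
    }

    printf("peer access 0 -> 1 enabled\n");
    return 0;
}
```

The same reset applies on the disable path for `cudaErrorPeerAccessNotEnabled`; the sketch shows only the enable side.
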