llama.cpp: 88 changes (1 addition, 87 deletions)
@@ -7894,9 +7894,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;
 
     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -10062,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     }
 }
 
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
 void llama_sample_repetition_penalties(
         struct llama_context * ctx,
         llama_token_data_array * candidates,
@@ -10192,38 +10188,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_sample_classifier_free_guidance(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        struct llama_context * guidance_ctx,
-        float scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
@@ -11724,15 +11688,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }
 
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -12244,15 +12199,6 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12802,38 +12748,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
-int llama_eval(
-        struct llama_context * ctx,
-        llama_token * tokens,
-        int32_t n_tokens,
-        int32_t n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-        struct llama_context * ctx,
-        float * embd,
-        int32_t n_tokens,
-        int32_t n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
llama.h: 45 changes (0 additions, 45 deletions)
@@ -364,9 +364,6 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
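Migration for the removed capability checks is a pure rename. A minimal sketch (the helper function below is illustrative, not part of this PR):

```cpp
#include "llama.h"

// Hypothetical helper: fill model params using the renamed capability checks.
static void set_memory_flags(struct llama_model_params & mparams) {
    mparams.use_mmap  = llama_supports_mmap();   // renamed from llama_mmap_supported()
    mparams.use_mlock = llama_supports_mlock();  // renamed from llama_mlock_supported()
}
```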
@@ -423,14 +420,6 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-            struct llama_context * ctx,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads),
-            "use llama_model_apply_lora_from_file instead");
-
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
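Callers of the removed context-based entry point can pass the context's model to the surviving model-based function. A hedged sketch (the wrapper name and error handling are illustrative):

```cpp
#include <cstdio>
#include "llama.h"

// Hypothetical drop-in for the removed llama_apply_lora_from_file(ctx, ...).
static int32_t apply_lora_compat(struct llama_context * ctx, const char * path_lora,
                                 float scale, const char * path_base_model, int32_t n_threads) {
    const struct llama_model * model = llama_get_model(ctx);
    const int32_t res = llama_model_apply_lora_from_file(model, path_lora, scale, path_base_model, n_threads);
    if (res != 0) {
        fprintf(stderr, "failed to apply LoRA adapter: %s\n", path_lora);
    }
    return res;
}
```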
@@ -606,27 +595,6 @@ extern "C" {
     // Decoding
     //
 
-    // Run the llama inference to obtain the logits and probabilities for the next token(s).
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval(
-            struct llama_context * ctx,
-            llama_token * tokens,
-            int32_t n_tokens,
-            int32_t n_past),
-            "use llama_decode() instead");
-
-    // Same as llama_eval, but use float matrix input directly.
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval_embd(
-            struct llama_context * ctx,
-            float * embd,
-            int32_t n_tokens,
-            int32_t n_past),
-            "use llama_decode() instead");
-
     // Return batch for single sequence of tokens starting at pos_0
     //
     // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
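The deleted llama.cpp bodies earlier in the diff show that llama_eval was a thin wrapper over the batch API. A minimal caller-side replacement sketch (the compat wrapper name is illustrative; the explicit KV-cache trim reproduces what llama_eval did internally):

```cpp
#include <cstdio>
#include "llama.h"

// Hypothetical equivalent of the removed llama_eval(ctx, tokens, n_tokens, n_past).
static int eval_compat(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, int32_t n_past) {
    // llama_eval silently dropped cached tokens at positions >= n_past;
    // with llama_decode this step becomes the caller's responsibility.
    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);

    const int ret = llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
    if (ret < 0) {
        fprintf(stderr, "llama_decode failed: %d\n", ret);
    }
    return ret;
}
```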
@@ -800,13 +768,6 @@ extern "C" {
             float * logits_guidance,
             float scale);
 
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            struct llama_context * guidance_ctx,
-            float scale),
-            "use llama_sample_apply_guidance() instead");
-
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
             struct llama_context * ctx,
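The removed definition earlier in llama.cpp doubles as the migration recipe: gather the candidates' logits into a flat buffer, run llama_sample_apply_guidance, and scatter the results back. A hedged sketch of that caller-side sequence (the function name is illustrative):

```cpp
#include <vector>
#include "llama.h"

// Hypothetical replacement for the removed llama_sample_classifier_free_guidance().
static void apply_cfg(struct llama_context * ctx, struct llama_context * guidance_ctx,
                      llama_token_data_array * candidates, float scale) {
    const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));

    // gather base logits from the candidates array (assumed unsorted, size == n_vocab)
    std::vector<float> logits_base(n_vocab);
    for (int32_t i = 0; i < n_vocab; ++i) {
        logits_base[i] = candidates->data[i].logit;
    }

    // blend in the guidance context's logits in place
    llama_sample_apply_guidance(ctx, logits_base.data(), llama_get_logits(guidance_ctx), scale);

    // scatter the adjusted logits back into the candidates array
    for (int32_t i = 0; i < n_vocab; ++i) {
        candidates->data[i].logit = logits_base[i];
    }
}
```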
@@ -860,12 +821,6 @@ extern "C" {
             llama_token_data_array * candidates,
             float temp);
 
-    LLAMA_API DEPRECATED(void llama_sample_temperature(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float temp),
-            "use llama_sample_temp instead");
-
     /// @details Apply constraints from grammar
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
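This removal is a pure rename with identical behavior, so migration is one line (`candidates_p` is assumed to come from the caller's existing sampling loop):

```cpp
// Before: llama_sample_temperature(ctx, &candidates_p, 0.8f);
llama_sample_temp(ctx, &candidates_p, 0.8f);  // same semantics, current name
```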