Improve beams_state by adding and using struct beam_view.

mattpulver · mattpulver · commit 91d65a8bf0ec · 2023-08-06T13:43:10.000-04:00
diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp
@@ -27,6 +27,24 @@
 #include <signal.h>
 #endif
 
+// Debug helper: pairs a context with a beam_view so the beam's tokens can be printed via operator<<.
+struct ostream_beam_view {
+    llama_context* ctx;  // Passed to llama_token_to_str() to turn each token into text.
+    beam_view bv;        // The beam to print.
+};
+std::ostream& operator<<(std::ostream& os, ostream_beam_view const& obv) {  // Writes: p(<p>) eos(<bool>) tokens(<text>)
+    os << "p(" << obv.bv.p << ") eos(" << std::boolalpha << obv.bv.eos() << ") tokens(";  // boolalpha is sticky: it stays set on os.
+    for (size_t i=0 ; i<obv.bv.n_tokens ; ++i) {
+        os << llama_token_to_str(obv.ctx, obv.bv.tokens[i]);  // Token strings are emitted back-to-back, no separator.
+    }
+    return os << ')';  // Returns os so calls can be chained.
+}
+
+// User-defined state: passed to llama_beam_search() as a void* and received back in beam_search_callback().
+struct beam_search_callback_state {
+    llama_context* ctx;                  // Used by the callback's debug printing of beams.
+    std::vector<llama_token>* response;  // Output: the callback appends common-prefix tokens here.
+};
 
 // Function matching type llama_beam_search_callback_fn_t.
 // Custom callback example is called each time the beams lengths increase:
@@ -35,22 +53,27 @@
 //    This is also called when the stop condition is met.
 //    Collect tokens into std::vector<llama_token> response which is pointed to by callback_state.
 beam_search_control beam_search_callback(void* callback_state, beams_state const beams_state) {
+    auto const state = *static_cast<beam_search_callback_state*>(callback_state);  // Local copy of the two-pointer state struct.
     printf(",");  // Show progress
     if (size_t const n = beams_state.common_prefix_length) {
-        auto* response = static_cast<std::vector<llama_token>*>(callback_state);
-        response->resize(response->size() + n);
+        state.response->resize(state.response->size() + n);  // Make room for the n shared-prefix tokens at the tail.
         assert(0u < beams_state.n_beams);
-        std::copy(beams_state.beams[0], beams_state.beams[0] + n, response->end() - n);
+        llama_token const* tokens = beams_state.beam_views[0].tokens;  // The first n tokens are common to all beams, so beam 0 suffices.
+        std::copy(tokens, tokens + n, state.response->end() - n);
         printf("%lu", n);
     }
     fflush(stdout);
-#if 0 // DEBUG: print current beams for this iteration
-            std::cout << "\n\nCurrent beams:\n";
-            for (size_t j=0 ; j < beams.size() ; ++j) {
-                std::cout << "beams["<<j<<"]: " << ostream_beam{ctx,beams[j]} << std::endl;
-            }
+#if 1 // DEBUG: print current beams for this iteration (enabled; change to 0 to silence)
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
 #endif
-    return { beams_state.n_beams, false };  // Continue beam search.
+    beam_search_control control {
+        beams_state.n_beams,  // = collapse_to. Any index out of range means do not collapse beams.
+        false                 // = stop. Don't stop beam search.
+    };
+    return control;
 }
 
 int main(int argc, char ** argv)
@@ -140,9 +163,10 @@ int main(int argc, char ** argv)
     n_past += tokens_list.size();
 
     std::vector<llama_token> response;
+    beam_search_callback_state callback_state{ctx, &response};
     size_t const beam_width = static_cast<size_t>(params.n_beams);
     int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &response, beam_width, n_past, n_predict, params.n_threads);
+    llama_beam_search(ctx, beam_search_callback, &callback_state, beam_width, n_past, n_predict, params.n_threads);
 
     printf("\n\n");
     for (llama_token const token_id : response) {
diff --git a/llama.cpp b/llama.cpp
@@ -53,8 +53,6 @@
 #include <sstream>
 #include <numeric>
 
-#include <iostream>
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -2880,7 +2878,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 
 struct beam {
     std::vector<llama_token> tokens;
-    float p;  // Cumulative beam probability (renormalized with each token)
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
     // end-of-sentence
     bool eos() const { return !tokens.empty() && tokens.back() == llama_token_eos(); }
     // Shift off first n tokens and discard them.
@@ -2890,19 +2888,6 @@ struct beam {
     }
 };
 
-// Used for debugging to print out beam tokens.
-struct ostream_beam {
-    llama_context* ctx;
-    beam& b;
-};
-std::ostream& operator<<(std::ostream& os, ostream_beam const& osb) {
-    os << "p(" << osb.b.p << ") eos(" << std::boolalpha << osb.b.eos() << ") tokens(";
-    for (llama_token const token_id : osb.b.tokens) {
-        os << llama_token_to_str(osb.ctx, token_id);
-    }
-    return os << ')';
-}
-
 // A struct for calculating logit-related info.
 struct logit_info {
     float const* const logits;
@@ -2962,18 +2947,16 @@ struct beam_search {
     // true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
     bool common_prefix_evaluated;
 
-    // Memory used by beam_state
-    std::vector<size_t> beam_lengths;
-    std::vector<llama_token const*> beam_ptrs;
+    // Temporary memory used by beams_state to pass back via callback.
+    std::vector<beam_view> beam_views;
 
     beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
       : ctx(ctx)
       , beam_width(beam_width)
       , n_past(n_past)
       , n_predict(n_predict)
       , n_threads(n_threads)
-      , beam_lengths(beam_width)
-      , beam_ptrs(beam_width) {
+      , beam_views(beam_width) {
         beams.reserve(beam_width);
         next_beams.reserve(beam_width);
     }
@@ -3074,11 +3057,10 @@ struct beam_search {
     // Side effect: set common_prefix_length = find_common_prefix_length();
     beams_state get_beams_state(bool const last_call) {
         for (size_t i=0 ; i<beams.size() ; ++i) {
-            beam_lengths[i] = beams[i].tokens.size();
-            beam_ptrs[i] = beams[i].tokens.data();
+            beam_views[i] = beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};  // Non-owning view into beams[i].tokens.
         }
         common_prefix_length = find_common_prefix_length();
-        return {beams.size(), beam_lengths.data(), beam_ptrs.data(), common_prefix_length, last_call};
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};  // Initializer order follows the beams_state fields.
     }
 
     // Loop:
diff --git a/llama.h b/llama.h
@@ -443,16 +443,24 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    // Lightweight, non-owning view of a beam: a pointer/length pair over its tokens plus its probability.
+    struct beam_view {
+        llama_token const* tokens;  // First token of the beam; not owned by the view.
+        size_t n_tokens;            // Number of tokens reachable through `tokens`.
+        float p;  // Cumulative beam probability (renormalized relative to all beams)
+        // end-of-sentence: true iff non-empty and the last token is llama_token_eos(). NOTE(review): a member function makes this struct C++-only inside extern "C" - confirm intended.
+        bool eos() const { return n_tokens && tokens[n_tokens-1u] == llama_token_eos(); }
+    };
+
     // Passed to beam_search_callback function.
     // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
     // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct beams_state {
-        size_t n_beams;                   // Number of elements in beam_lengths[] and beams[].
-        size_t const* beam_lengths;       // Length of each beam.
-        llama_token const* const* beams;  // Current tokens in each beam.
-        size_t common_prefix_length;      // Current max length of prefix tokens shared by all beams.
-        bool last_call;                   // True iff this is the last callback invocation.
+        beam_view* beam_views;        // View of each beam; array of n_beams elements.
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool last_call;               // True iff this is the last callback invocation.
     };
     // Must be returned by beam_search_callback function.
     struct beam_search_control {