@@ -2163,17 +2163,10 @@ struct server_context {
                             GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                         }
 
-                        common_sampler_reset(slot.smpl);
-
                         if (slot.params.cache_prompt) {
                             // reuse any previously computed tokens that are common with the new prompt
                             slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
 
-                            // push the prompt into the sampling context (do not apply grammar)
-                            for (int i = 0; i < slot.n_past; ++i) {
-                                common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                            }
-
                             // reuse chunks from the cached prompt by shifting their KV cache in the new position
                             if (params.n_cache_reuse > 0) {
                                 size_t head_c = slot.n_past; // cache
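The reuse decision above rests on the longest_common_prefix helper from the server utilities, which reports how many leading tokens the cached sequence and the incoming prompt share; everything past that point has to be evaluated again. Below is a minimal standalone sketch of that behavior (the token typedef and the helper body are simplified stand-ins for illustration, not the upstream implementation):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t; // simplified stand-in for the real typedef

    // sketch of the helper's assumed behavior: count how many leading tokens match
    static size_t longest_common_prefix(const std::vector<llama_token> & a,
                                        const std::vector<llama_token> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }

    int main() {
        const std::vector<llama_token> cache_tokens  = {1, 15043, 29892, 3186, 2};
        const std::vector<llama_token> prompt_tokens = {1, 15043, 29892, 7855, 2};

        // the first 3 tokens match, so slot.n_past would start at 3 and only the
        // remaining prompt suffix needs to be evaluated again
        printf("n_past = %zu\n", longest_common_prefix(cache_tokens, prompt_tokens));
        return 0;
    }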
@@ -2206,8 +2199,6 @@ struct server_context {
                                     for (size_t i = 0; i < n_match; i++) {
                                         slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
 
-                                        common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
                                         slot.n_past++;
                                     }
 
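For orientation, the n_cache_reuse machinery surrounding this hunk walks two cursors, head_c over the cached tokens and head_p over the new prompt, both starting at slot.n_past; whenever it finds a run of at least n_cache_reuse identical tokens it shifts that chunk's KV-cache entries into the new position and copies the tokens as shown above. The following is only an approximation of that scan, with the KV-cache shift calls left out and the loop shape assumed rather than copied from upstream:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;

    // illustrative only: one plausible shape of the chunk-reuse scan; the real
    // server code additionally shifts the matching KV-cache entries and updates
    // slot.cache_tokens, which is omitted here
    static size_t count_reusable_tokens(const std::vector<llama_token> & cache_tokens,
                                        const std::vector<llama_token> & prompt_tokens,
                                        size_t n_past, size_t n_cache_reuse) {
        size_t head_c = n_past; // cache
        size_t head_p = n_past; // current prompt

        while (head_c < cache_tokens.size() && head_p < prompt_tokens.size()) {
            size_t n_match = 0;
            while (head_c + n_match < cache_tokens.size() &&
                   head_p + n_match < prompt_tokens.size() &&
                   cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
                n_match++;
            }

            if (n_match >= n_cache_reuse) {
                // chunk found: this is where the server shifts the KV cache and
                // advances slot.n_past
                n_past += n_match;
                head_c += n_match;
                head_p += n_match;
            } else {
                head_c += 1; // skip one cached token and keep searching
            }
        }

        return n_past;
    }

    int main() {
        // the prompt drops token 4 that is in the cache, so the 5..8 tail can be
        // reused after shifting (common prefix gives n_past = 3, n_cache_reuse = 2)
        const std::vector<llama_token> cache  = {1, 2, 3, 4, 5, 6, 7, 8};
        const std::vector<llama_token> prompt = {1, 2, 3, 5, 6, 7, 8, 9};

        printf("n_past after reuse scan: %zu\n", count_reusable_tokens(cache, prompt, 3, 2));
        return 0;
    }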
@@ -2259,8 +2250,6 @@ struct server_context {
 
                             // there is no common part left
                             slot.n_past = 0;
-
-                            common_sampler_reset(slot.smpl);
                         }
 
                         SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
@@ -2288,6 +2277,13 @@ struct server_context {
 
                     GGML_ASSERT(batch.n_tokens > 0);
 
+                    common_sampler_reset(slot.smpl);
+
+                    // Process all prompt tokens through sampler system
+                    for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                        common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                    }
+
                     // extract the logits only for the last token
                     batch.logits[batch.n_tokens - 1] = true;
 
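Taken together, the hunks move all sampler bookkeeping to a single point: instead of resetting the sampler and accepting tokens piecemeal inside the cache-reuse branches, the server now resets it once the batch is assembled and replays the entire prompt through common_sampler_accept with the grammar flag set to false (the prompt is pushed into the sampling context without being applied to the grammar, as the removed comment put it). A toy reset-and-replay sketch of that pattern, using a stub sampler rather than the real common_sampler API:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;

    // stub standing in for common_sampler: it only tracks the tokens it has been
    // told about (e.g. for repetition penalties); the real sampler does far more
    struct toy_sampler {
        std::vector<llama_token> prev;

        void reset() {
            prev.clear();
        }

        // accept_grammar mirrors the 'false' flag in the diff: the token is added
        // to the sampling context without being applied to the grammar
        void accept(llama_token tok, bool accept_grammar) {
            (void) accept_grammar;
            prev.push_back(tok);
        }
    };

    int main() {
        const std::vector<llama_token> prompt_tokens = {1, 15043, 29892, 3186};

        toy_sampler smpl;

        // new flow from the diff: one reset, then replay the full prompt, so the
        // sampler sees the same history whether or not part of the prompt came
        // from the prompt cache
        smpl.reset();
        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
            smpl.accept(prompt_tokens[i], /*accept_grammar=*/false);
        }

        printf("sampler primed with %zu prompt tokens\n", smpl.prev.size());
        return 0;
    }

Rebuilding the state from the full prompt in one place means the sampler sees the same history regardless of how much of the prompt came from the cache, rather than having to be kept in sync separately in each reuse path.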