From 430a0288dc342a9971d47aa34e15e158f7be1678 Mon Sep 17 00:00:00 2001
From: Mikhail Podvitskii
Date: Tue, 29 Apr 2025 19:25:24 +0000
Subject: [PATCH 1/2] Refine prompt generation in benchmark dataset

Signed-off-by: Mikhail Podvitskii
---
 benchmarks/benchmark_dataset.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 9c614baf1f0c..469e66113c83 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -315,13 +315,15 @@ def sample(
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
@@ -344,6 +346,9 @@ def sample(
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:input_lens[i]]
+            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(

From c4df42177ba05b94423c6504c816ace9ded305cb Mon Sep 17 00:00:00 2001
From: Mikhail Podvitskii
Date: Mon, 5 May 2025 12:47:51 +0000
Subject: [PATCH 2/2] clarifying comment

Signed-off-by: Mikhail Podvitskii
---
 benchmarks/benchmark_dataset.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 469e66113c83..b81c2f8192db 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -346,6 +346,14 @@ def sample(
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N tokens.
+            # For example, for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid an uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decoded again.
             re_encoded_sequence = tokenizer.encode(
                 prompt, add_special_tokens=False)[:input_lens[i]]
             prompt = tokenizer.decode(re_encoded_sequence)
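
For readers who want to see the two adjustments in isolation, the following is a minimal standalone sketch, not part of the diffs above. It assumes the HuggingFace transformers GPT-2 tokenizer; the token ids are the ones quoted in the patch comment, and target_len stands in for input_lens[i].

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Patch 1/2: budget for special tokens the tokenizer may add on top of
# the sampled ids (0 for GPT-2; e.g. 1 for tokenizers that prepend BOS),
# so the final prompt still fits the requested length.
input_len = 1024
real_input_len = input_len - tokenizer.num_special_tokens_to_add()

# Patch 2/2: decoding random ids and re-encoding the resulting string
# does not always round-trip to the same number of tokens.
token_sequence = [6880, 6881]              # ['Ġcalls', 'here']
prompt = tokenizer.decode(token_sequence)  # " callshere"
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)
print(re_encoded)                          # [1650, 939, 486] -- 2 ids became 3

# The fix: truncate the re-encoded ids back to the sampled length and
# decode once more, so the prompt cannot silently grow past the target.
target_len = len(token_sequence)           # stands in for input_lens[i]
prompt = tokenizer.decode(re_encoded[:target_len])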