From 430a0288dc342a9971d47aa34e15e158f7be1678 Mon Sep 17 00:00:00 2001
From: Mikhail Podvitskii
Date: Tue, 29 Apr 2025 19:25:24 +0000
Subject: [PATCH 1/2] Refine prompt generation in benchmark dataset

Signed-off-by: Mikhail Podvitskii
---
 benchmarks/benchmark_dataset.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 9c614baf1f0c..469e66113c83 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -315,13 +315,15 @@ def sample(
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
@@ -344,6 +346,9 @@ def sample(
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:input_lens[i]]
+            prompt = tokenizer.decode(re_encoded_sequence)
             total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(

From c4df42177ba05b94423c6504c816ace9ded305cb Mon Sep 17 00:00:00 2001
From: Mikhail Podvitskii
Date: Mon, 5 May 2025 12:47:51 +0000
Subject: [PATCH 2/2] clarifying comment

Signed-off-by: Mikhail Podvitskii
---
 benchmarks/benchmark_dataset.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 469e66113c83..b81c2f8192db 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -346,6 +346,14 @@ def sample(
                          vocab_size).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N tokens.
+            # For example, for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid an uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decoded again.
             re_encoded_sequence = tokenizer.encode(
                 prompt, add_special_tokens=False)[:input_lens[i]]
             prompt = tokenizer.decode(re_encoded_sequence)
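
For readers who want to see the two adjustments in isolation, the following is a minimal standalone sketch, not part of the diffs above. It assumes the HuggingFace transformers GPT-2 tokenizer; the token ids are the ones quoted in the patch comment, and target_len stands in for input_lens[i].

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Patch 1/2: budget for special tokens the tokenizer may add on top of
# the sampled ids (0 for GPT-2; e.g. 1 for tokenizers that prepend BOS),
# so the final prompt still fits the requested length.
input_len = 1024
real_input_len = input_len - tokenizer.num_special_tokens_to_add()

# Patch 2/2: decoding random ids and re-encoding the resulting string
# does not always round-trip to the same number of tokens.
token_sequence = [6880, 6881]              # ['Ġcalls', 'here']
prompt = tokenizer.decode(token_sequence)  # " callshere"
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)
print(re_encoded)                          # [1650, 939, 486] -- 2 ids became 3

# The fix: truncate the re-encoded ids back to the sampled length and
# decode once more, so the prompt cannot silently grow past the target.
target_len = len(token_sequence)           # stands in for input_lens[i]
prompt = tokenizer.decode(re_encoded[:target_len])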