
Commit f7f18c7

Add max-prefill-length argument in distillation dataset generation script
1 parent 5c7df89 commit f7f18c7

3 files changed: +32 -29 lines changed

MaxText/generate_distillation_data.py

Lines changed: 15 additions & 11 deletions
@@ -24,20 +24,26 @@
   --dataset-path HuggingFaceH4/ultrachat_200k --data-split train_sft --data-columns messages \
   --tokenizer-path deepseek-ai/DeepSeek-V2-Lite-chat \
   --hf-access-token <access token> \
-  --batch-size 1024 --num-batches 100 \
+  --batch-size 1024 --num-batches 10 \
   --num-generations 2 \
-  --max-output-length 128 --max-target-length 256 \
+  --max-prefill-length 256 --max-target-length 2048 \
   --use-chat-template --remove-local-dataset-files \
   upload-to-hf --hf-repo-id <hf repository id>

-Running this command executes 100 processing steps.
-In each step, it generates completions for a batch of 40 prompts.
-This results in inference running on 4000 prompts overall, producing 2 samples per prompt.
+Running this command executes 10 processing steps.
+In each step, it generates completions for a batch of 1024 prompts.
+This results in inference running on 10240 prompts overall, producing 2 unique samples per prompt.
+Some prompts may be filtered out if prompt tokens are longer than `max-prefill-length`.
+`max-target-length` is the max length of prompt tokens and expected completion tokens.
+Set `--remove-local-dataset-files` to remove dataset files created locally after uploading to Hugging Face or GCS.
+`upload-to-hf` will upload the dataset to Hugging Face and `upload-to-gcs` will upload the dataset to GCS.
+For more information, check out `python3 -m MaxText.generate_distillation_data --help`.
 Note:
   Make sure to run maxengine server in a new terminal before executing this command. Example command to run maxengine server:
   python3 -m MaxText.maxengine_server MaxText/configs/base.yml \
     model_name=deepseek2-16b tokenizer_path=deepseek-ai/DeepSeek-V2-Lite-chat tokenizer_type=huggingface \
     load_parameters_path=<unscanned checkpoint path> \
+    max_target_length=2048 max_prefill_predict_length=256 \
     per_device_batch_size=10 multi_sampling=True ici_tensor_parallelism=4 \
     decode_sampling_strategy=weighted scan_layers=False
 """
@@ -92,7 +98,7 @@ async def send_request(config, request, stub, tokenizer, progress_bar): # pylin

   outputs = []
   for tokens in completion_tokens:
-    completion = tokenizer.decode(tokens, skip_special_tokens=True)
+    completion = tokenizer.decode(tokens, skip_special_tokens=True).strip()
     outputs.append(
         {
             "prompt": [{"role": "user", "content": prompt}],
@@ -256,9 +262,7 @@ def generate_data(config): # pylint: disable=redefined-outer-name
   )
   parser.add_argument("--tokenizer-path", type=str, required=True, help="Path to Hugging Face tokenizer.")
   parser.add_argument("--use-chat-template", action="store_true", help="Enable tokenizer to apply a chat template.")
-  parser.add_argument(
-      "--max-output-length", type=int, required=True, help="The maximum completion tokens to generate for a prompt."
-  )
+  parser.add_argument("--max-prefill-length", type=int, default=256, help="The maximum prompt length.")
   parser.add_argument(
       "--max-target-length", type=int, default=2048, help="The maximum prompt length plus the output completion length."
   )
@@ -293,6 +297,6 @@ def generate_data(config): # pylint: disable=redefined-outer-name
   config = parser.parse_args()

   assert (
-      config.max_output_length < config.max_target_length
-  ), "Maximum output length of completion should be less than maximum target length."
+      config.max_prefill_length < config.max_target_length
+  ), "Maximum length of prompt should be less than maximum target length."
   generate_data(config)
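
Taken together, these hunks replace the required `--max-output-length` flag with an optional `--max-prefill-length` and validate it against `--max-target-length`. A self-contained sketch of the new CLI surface (an approximation for illustration, not the full MaxText parser):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max-prefill-length", type=int, default=256, help="The maximum prompt length.")
parser.add_argument(
    "--max-target-length", type=int, default=2048, help="The maximum prompt length plus the output completion length."
)
config = parser.parse_args([])  # defaults: prefill 256, target 2048

# Same validation as the commit: the prompt budget must leave room for output.
assert (
    config.max_prefill_length < config.max_target_length
), "Maximum length of prompt should be less than maximum target length."
print(config.max_target_length - config.max_prefill_length)  # implied completion budget: 1792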

MaxText/input_pipeline/_distillation_data_processing.py

Lines changed: 5 additions & 3 deletions
@@ -118,19 +118,21 @@ def filter_dataset(config, dataset, tokenizer):
     prompt = data["prompt"][0]
     actual_completion = data["completion"][0]

-    max_output_tokens = min(config.max_output_length, len(tokenizer.encode(actual_completion)))
+    max_output_length = config.max_target_length - config.max_prefill_length
+    max_output_tokens = min(max_output_length, len(tokenizer.encode(actual_completion)))
     if config.use_chat_template:
       message = [{"role": "user", "content": prompt}]
       prompt_token_ids = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=True)
     else:
       prompt_token_ids = tokenizer.encode(prompt)

-    # Filter out long prompt sequences
-    if len(prompt_token_ids) + max_output_tokens > config.max_target_length:
+    # Filter out prompt sequences that are longer than max_prefill_length
+    if len(prompt_token_ids) > config.max_prefill_length:
       continue

     request = InputRequest(prompt, prompt_token_ids, actual_completion, max_output_tokens)
     filtered_dataset.append(request)
   if len(filtered_dataset) < len(dataset):
+    max_logging.log("Some prompts are longer than `max-prefill-length` and will be filtered out.")
     max_logging.log(f"Filtering reduced dataset batch from {len(dataset)} to {len(filtered_dataset)} samples.")
   return filtered_dataset
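
The new filtering rule is simple enough to reproduce standalone. The hedged sketch below re-creates it with a whitespace "tokenizer" standing in for the real Hugging Face tokenizer so it runs without any downloads; `Config` and `filter_prompts` are illustrative names, not MaxText APIs:

from dataclasses import dataclass

@dataclass
class Config:
  max_prefill_length: int = 256
  max_target_length: int = 2048

def filter_prompts(config, prompts):
  kept = []
  for prompt in prompts:
    prompt_token_ids = prompt.split()  # stand-in for tokenizer.encode(prompt)
    # Drop prompts whose token count exceeds the prefill budget.
    if len(prompt_token_ids) > config.max_prefill_length:
      continue
    kept.append(prompt)
  if len(kept) < len(prompts):
    print("Some prompts are longer than `max-prefill-length` and will be filtered out.")
  return kept

config = Config(max_prefill_length=4)
print(filter_prompts(config, ["short prompt", "a prompt that is definitely too long here"]))
# -> logs the warning, then keeps only ["short prompt"]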

MaxText/tests/distillation_data_processing_test.py

Lines changed: 12 additions & 15 deletions
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-distillation data processing test
-"""
+
+"""Data processing tests for distillation."""

 import argparse
 import os
@@ -33,7 +32,7 @@
         {"content": "Why is the sky blue?", "role": "user"},
     ],
     [
-        {"content": "How many days are in a week?", "role": "user"},
+        {"content": "Can you tell me how many days are in a week?", "role": "user"},
     ],
 ]

@@ -55,7 +54,7 @@
         {"content": "The sky appears blue due a phenomemon called Rayleigh scattering.", "role": "assistant"},
     ],
     [
-        {"content": "How many days are in a week?", "role": "user"},
+        {"content": "Can you tell me how many days are in a week?", "role": "user"},
         {"content": "There are 7 days in a week.", "role": "assistant"},
     ],
 ]
@@ -64,11 +63,9 @@
 def add_arguments_to_parser(parser):
   parser.add_argument("--data-columns", nargs="+", required=True, help="Columns names that contain relevant data.")
   parser.add_argument("--use-chat-template", action="store_true", help="Enable tokenizer to apply a chat template.")
+  parser.add_argument("--max-prefill-length", type=int, default=16, help="The maximum length for prompt tokens.")
   parser.add_argument(
-      "--max-output-length", type=int, default=8, help="The maximum completion tokens to generate for a prompt."
-  )
-  parser.add_argument(
-      "--max-target-length", type=int, default=16, help="The maximum prompt length plus the output completion length."
+      "--max-target-length", type=int, default=32, help="The maximum prompt length plus the output completion length."
   )
   return parser

@@ -83,7 +80,7 @@ def setUpClass(cls):
         "gsutil",
         "cp",
         "-r",
-        "gs://maxtext-dataset/hf/llama2-tokenizer",
+        "gs://maxtext-dataset/hf/llama2-chat-tokenizer",
         os.path.join(os.path.dirname(PKG_DIR), "assets", ""),
     ]
 )
@@ -93,7 +90,7 @@ def setUpClass(cls):
   def setUp(self):
     super().setUp()
     self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-        os.path.join(os.path.dirname(PKG_DIR), "assets", "llama2-tokenizer"),
+        os.path.join(os.path.dirname(PKG_DIR), "assets", "llama2-chat-tokenizer"),
     )
     self.parser = argparse.ArgumentParser()
     self.parser = add_arguments_to_parser(self.parser)
@@ -104,7 +101,7 @@ def test_data_processing_with_messages(self):

     processed_dataset = _distillation_data_processing.process_dataset(config, dataset)

-    expected_prompts = [["What color is the sky?", "Why is the sky blue?"], ["How many days are in a week?"]]
+    expected_prompts = [["What color is the sky?", "Why is the sky blue?"], ["Can you tell me how many days are in a week?"]]
     expected_completions = [
         ["The sky is blue.", "The sky appears blue due a phenomemon called Rayleigh scattering."],
         ["There are 7 days in a week."],
@@ -121,7 +118,7 @@ def test_data_processing_with_messages(self):
       self.assertEqual(data["completion"][c_idx], completion)

   def test_data_filtering_with_messages(self):
-    config = self.parser.parse_args(["--data-columns", "messages"])
+    config = self.parser.parse_args(["--data-columns", "messages", "--use-chat-template"])
     dataset = Dataset.from_dict({"messages": MESSAGES_DATA})

     processed_dataset = _distillation_data_processing.process_dataset(config, dataset)
@@ -137,7 +134,7 @@ def test_data_processing_with_prompt_completion(self):

     processed_dataset = _distillation_data_processing.process_dataset(config, dataset)

-    expected_prompts = [["What color is the sky?", "Why is the sky blue?"], ["How many days are in a week?"]]
+    expected_prompts = [["What color is the sky?", "Why is the sky blue?"], ["Can you tell me how many days are in a week?"]]
     expected_completions = [
         ["The sky is blue.", "The sky appears blue due a phenomemon called Rayleigh scattering."],
         ["There are 7 days in a week."],
@@ -154,7 +151,7 @@ def test_data_processing_with_prompt_completion(self):
       self.assertEqual(data["completion"][c_idx], completion)

   def test_data_filtering_with_prompt_completion(self):
-    config = self.parser.parse_args(["--data-columns", "prompt", "completion"])
+    config = self.parser.parse_args(["--data-columns", "prompt", "completion", "--use-chat-template"])
     dataset = Dataset.from_dict({"prompt": PROMPT_DATA, "completion": COMPLETION_DATA})

     processed_dataset = _distillation_data_processing.process_dataset(config, dataset)
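
For the tests, the defaults shift from (`max-output-length` 8, `max-target-length` 16) to (`max-prefill-length` 16, `max-target-length` 32), so the implied completion budget stays at 32 - 16 = 16 tokens. A quick hedged check of that, using the same parser shape as `add_arguments_to_parser` above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data-columns", nargs="+", required=True)
parser.add_argument("--use-chat-template", action="store_true")
parser.add_argument("--max-prefill-length", type=int, default=16)
parser.add_argument("--max-target-length", type=int, default=32)

config = parser.parse_args(["--data-columns", "messages", "--use-chat-template"])
# Completion budget implied by the updated test defaults.
assert config.max_target_length - config.max_prefill_length == 16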
