allow for bf16 training

danielvegamyhre · danielvegamyhre · commit d63c7fa95209 · 2025-02-28T15:22:47.000-08:00
diff --git a/torchao/float8/benchmarking/float8_training_benchmark.sh b/torchao/float8/benchmarking/float8_training_benchmark.sh
@@ -3,7 +3,6 @@
 # with the given parameters,
 
 # script arguments
-RECIPE=${RECIPE:-"tensorwise"}
 BATCH_SIZE=${BATCH_SIZE:-1}
 STEPS=${STEPS:-100}
 
@@ -15,26 +14,32 @@ if [ -z "${TORCHTITAN_ROOT}" ]; then
   echo "Error: TORCHTITAN environment variable is not set. Please set it before running this script."
   echo "Usage: TORCHTITAN_ROOT=<directory> ./float8_training_benchmark.sh"
   echo "Optional parameters configurable via environment variables:"
-  echo " * RECIPE: rowwise|tensorwise. defaults to tensorwise."
+  echo " * FLOAT8_RECIPE: 'rowwise' or 'tensorwise'. if set, use float8 training with the specified recipe. otherwise, use bf16 mixed precision training."
   echo " * BATCH_SIZE: defaults to 1."
   echo " * STEPS: defaults to 100."
   exit 1
 fi
 
 # validate recipe name
-if [ "$RECIPE" != "rowwise" ] && [ "$RECIPE" != "tensorwise" ]; then
-    echo "Error: RECIPE must be 'rowwise' or 'tensorwise'"
-    exit 1
+if [ -n "${FLOAT8_RECIPE}" ]; then  # unset/empty FLOAT8_RECIPE skips validation and adds no float8 flags
+  if [ "$FLOAT8_RECIPE" != "rowwise" ] && [ "$FLOAT8_RECIPE" != "tensorwise" ]; then
+    echo "Error: FLOAT8_RECIPE must be 'rowwise' or 'tensorwise'"
+    exit 1
+  fi
+  FLOAT8_ARGS="--model.converters=float8 --float8.recipe_name=${FLOAT8_RECIPE}"  # space-separated flags; expanded unquoted by the run command
 fi
 
+
 # remember current directory to return to it later
 original_dir=$(pwd)
 
 # navigate to torchtitan root dir
 cd ${TORCHTITAN_ROOT}
 
+echo "float8 args: ${FLOAT8_ARGS}"
+
 # run the command with the specified arguments
-CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.batch_size=$BATCH_SIZE --training.compile --model.converters="float8" --float8.recipe_name=$RECIPE 2>&1 | tee ${LOG_FILE}
+CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" "${TORCHTITAN_ROOT}/run_train.sh" --training.steps="${STEPS}" --training.batch_size="${BATCH_SIZE}" --training.compile ${FLOAT8_ARGS} 2>&1 | tee "${LOG_FILE}"  # FLOAT8_ARGS stays unquoted on purpose: it must word-split into separate flags
 
 # return to original working directory
 cd $original_dir