
Commit aba1a10

add fp8 support
1 parent e5548b7 commit aba1a10

File tree

1 file changed: 14 additions, 1 deletion


scripts/hf_eval.py

Lines changed: 14 additions & 1 deletion
@@ -56,6 +56,19 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
         change_linear_weights_to_int4_woqtensors(model.to(device=device))
     elif quantization == "autoquant":
         model = autoquant(model.to(device=device))
+    elif quantization == "fp8":
+        from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear
+        from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
+        model.to(device)
+        swap_linear_with_float8_linear(
+            model,
+            Float8DynamicLinear,
+            from_float_kwargs={
+                "pre_quantize_weight": True,
+            },
+        )
+        pass # no quantization applied, model is already on device and precision dtype.
+
     with torch.no_grad():
         result = evaluate(
             HFLM(
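The new branch uses float8_experimental (since upstreamed into torchao as torchao.float8) to swap every nn.Linear in the model for a Float8DynamicLinear, which casts activations to float8 dynamically at runtime; with "pre_quantize_weight": True the weights are presumably converted once up front rather than on every forward. A minimal sketch of the module-swapping idea, for orientation only (the real swap_linear_with_float8_linear also handles module filtering and builds replacements via the target class's from_float constructor):

```python
import torch.nn as nn

def swap_linears(module: nn.Module, make_replacement) -> nn.Module:
    # Recursively walk the module tree and replace each nn.Linear
    # child with whatever make_replacement(child) returns.
    # Illustrative sketch only, not float8_experimental's implementation.
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            setattr(module, name, make_replacement(child))
        else:
            swap_linears(child, make_replacement)
    return module
```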
@@ -78,7 +91,7 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
     parser.add_argument('--limit', type=int, default=None, help='Number of eval samples to evaluate')
     parser.add_argument('--precision', type=lambda x: getattr(torch, x.split(".")[-1]), default=torch.bfloat16, help='dtype precision to use')
     parser.add_argument('--device', type=str, default="cuda", help='Device to use for evaluation')
-    parser.add_argument('-q', '--quantization', default = "None", choices=["int8dq", "int8wo", "int4wo","autoquant", "None"], help='Which quantization technique to apply')
+    parser.add_argument('-q', '--quantization', default = "None", choices=["int8dq", "int8wo", "int4wo","autoquant", "fp8", "None"], help='Which quantization technique to apply')
     parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
     parser.add_argument('--batch_size', type=int, default=1, help='Batch size to use for evaluation, note int8wo and int4wo work best with small batchsizes, int8dq works better with large batchsizes')
     parser.add_argument('--max_length', type=int, default=None, help='Length of text to process at one time')
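With "fp8" wired into the argparse choices, an evaluation run selecting it might look like the following. The --repo_id and --tasks flags are assumed from run_evaluation's signature and do not appear in this diff, and the model name is a placeholder; only -q, --precision, --device, --limit, --compile, --batch_size, and --max_length are shown above:

```sh
# Hypothetical invocation; --repo_id/--tasks values are placeholders.
python scripts/hf_eval.py \
    --repo_id meta-llama/Llama-2-7b-hf \
    --tasks wikitext \
    --precision bfloat16 \
    --device cuda \
    -q fp8 \
    --limit 100
```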
