Remove layer norm from the default quantizer, add one that has it
Summary:
Layer norm does not perform well in quantized mode and currently uses a split scheme (weights are quantized, activations are not). In most cases it is actually much faster to keep it in fp32, so this diff removes it from the default quantizer.
We add a CadenceWithLayerNormQuantizer for easy access to the previous behavior, which can still be useful in some cases (mostly when quantizing layer norm helps extend the quantized liveness), as sketched below.
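A minimal sketch of how a caller might opt back in to quantized layer norm by swapping quantizers in the PT2E flow. The import path, the default quantizer class name, and the export/prepare steps are assumptions for illustration; only CadenceWithLayerNormQuantizer is named by this diff.

```python
# Sketch only: import paths and the default quantizer name are assumed, not confirmed here.
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

# Hypothetical import location for the Cadence quantizers.
from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceDefaultQuantizer,        # assumed name; layer norm now stays in fp32
    CadenceWithLayerNormQuantizer,  # opt back in to quantized layer norm
)


class Model(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)
        self.norm = torch.nn.LayerNorm(16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.norm(self.linear(x))


model = Model().eval()
example_inputs = (torch.randn(1, 16),)

# Default behavior after this diff: layer norm is left in fp32.
quantizer = CadenceDefaultQuantizer()
# To quantize layer norm as well (e.g. to keep a longer quantized region):
# quantizer = CadenceWithLayerNormQuantizer()

# Export and annotate/convert; the exact export API may vary by PyTorch version.
exported = torch.export.export(model, example_inputs).module()
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)           # calibration pass
converted = convert_pt2e(prepared)  # quantized graph
```

The design point here is that the trade-off is made explicit at quantizer-selection time rather than baked into the default: most users get the faster fp32 layer norm, while models that benefit from a longer quantized region can opt in.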
Differential Revision: D72941790