diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 9e39b84c0b..d6192d33aa 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -170,29 +170,35 @@ def assert_dequantized_grouped_scaled_tensor( ("quick_gelu", "linear"), ("squared_relu",), ("squared_relu", "linear"), + ("clamped_silu", "clamped_linear"), ] ACTIVATION_TYPES = { "L0": [ ("gelu",), ("gelu", "linear"), + ("clamped_silu", "clamped_linear"), ], "L2": ALL_ACTIVATION_TYPES, } class TestActivation: - def ref_act(self, x, activation_type): - return _jax_act_lu(x, activation_type).data + def ref_act(self, x, activation_type, act_params): + return _jax_act_lu(x, activation_type, act_params=act_params).data - def value_n_grad_ref_func(self, x, activation_type): + def value_n_grad_ref_func(self, x, activation_type, act_params): jitted_reference = jit( - value_and_grad(lambda out: jnp.mean(self.ref_act(out, activation_type)), (0,)) + value_and_grad( + lambda out: jnp.mean(self.ref_act(out, activation_type, act_params)), (0,) + ) ) return jitted_reference(x) - def primitive_func(self, inputs, activation_type, quantizer): - out = activation(inputs, activation_type=activation_type, quantizer=quantizer) + def primitive_func(self, inputs, activation_type, quantizer, act_params): + out = activation( + inputs, activation_type=activation_type, quantizer=quantizer, act_params=act_params + ) return jnp.mean(out) @pytest_parametrize_wrapper("shape", ALL_ACTIVATION_SHAPES) @@ -209,12 +215,20 @@ def test_act_grad(self, shape, activation_type): x = jnp.repeat(x, len(activation_type), axis=-2) value_n_grad_primitive_func = jit( - value_and_grad(self.primitive_func, (0,)), static_argnums=(1,) + value_and_grad(self.primitive_func, (0,)), static_argnums=(1, 3) ) - - prim_out, (prim_grad,) = value_n_grad_primitive_func(x, activation_type, None) - ref_out, (ref_grad,) = self.value_n_grad_ref_func(x, activation_type) - + act_args = ( + {"limit": 0.75, "alpha": 1.702} + if activation_type == ("clamped_silu", "clamped_linear") + else {} + ) + act_params = ( + tex.activation.ActivationParams.create(activation_type=activation_type, **act_args) + if activation_type == ("clamped_silu", "clamped_linear") + else None + ) + prim_out, (prim_grad,) = value_n_grad_primitive_func(x, activation_type, None, act_params) + ref_out, (ref_grad,) = self.value_n_grad_ref_func(x, activation_type, act_params) assert_allclose(prim_out, ref_out, dtype=x.dtype) assert_allclose(prim_grad, ref_grad, dtype=x.dtype) @@ -234,7 +248,8 @@ def test_act_grad_with_tensor_scaling_fp8( self.activation_type = activation_type value_n_grad_primitive_func = jit( - value_and_grad(self.primitive_func, (0,)), static_argnums=(1,) + value_and_grad(self.primitive_func, (0,)), + static_argnums=(1, 3), ) quantizer = QuantizerFactory.create( @@ -242,9 +257,21 @@ def test_act_grad_with_tensor_scaling_fp8( q_dtype=output_type, q_layout=QuantizeLayout.ROWWISE, ) + act_args = ( + {"limit": 0.75, "alpha": 1.702} + if activation_type == ("clamped_silu", "clamped_linear") + else {} + ) - prim_out, (prim_grad,) = value_n_grad_primitive_func(x, activation_type, quantizer) - ref_out, (ref_grad,) = self.value_n_grad_ref_func(x, activation_type) + act_params = ( + tex.activation.ActivationParams.create(activation_type=activation_type, **act_args) + if activation_type == ("clamped_silu", "clamped_linear") + else None + ) + prim_out, (prim_grad,) = value_n_grad_primitive_func( + x, activation_type, quantizer, act_params + ) + ref_out, 
(ref_grad,) = self.value_n_grad_ref_func(x, activation_type, act_params) assert_allclose(prim_out, ref_out, dtype=output_type) assert_allclose(prim_grad, ref_grad, dtype=output_type) @@ -273,10 +300,18 @@ def test_act_forward_with_tensor_scaling_fp8( q_dtype=output_type, q_layout=q_layout, ) - - te_output = tex.act_lu(x, activation_type, te_quantizer) - jax_output = _jax_act_lu(x, activation_type, jax_quantizer) - + act_args = ( + {"limit": 0.75, "alpha": 1.702} + if activation_type == ("clamped_silu", "clamped_linear") + else {} + ) + act_params = ( + tex.activation.ActivationParams.create(activation_type=activation_type, **act_args) + if activation_type == ("clamped_silu", "clamped_linear") + else None + ) + te_output = tex.act_lu(x, activation_type, te_quantizer, act_params) + jax_output = _jax_act_lu(x, activation_type, jax_quantizer, act_params) assert_bitwise_scaled_tensors(te_output, jax_output) @pytest.mark.skipif(not is_mxfp8_supported, reason=mxfp8_unsupported_reason) @@ -296,10 +331,18 @@ def test_act_forward_with_block_scaling_fp8( quantizer = QuantizerFactory.create( scaling_mode=ScalingMode.MXFP8_1D_SCALING, q_dtype=output_type, q_layout=q_layout ) - - output = tex.act_lu(x, activation_type, quantizer) - ref_out = self.ref_act(x, activation_type) - + act_args = ( + {"limit": 0.75, "alpha": 1.702} + if activation_type == ("clamped_silu", "clamped_linear") + else {} + ) + act_params = ( + tex.activation.ActivationParams.create(activation_type=activation_type, **act_args) + if activation_type == ("clamped_silu", "clamped_linear") + else None + ) + output = tex.act_lu(x, activation_type, quantizer, act_params) + ref_out = self.ref_act(x, activation_type, act_params) assert_dequantized_scaled_tensor(output, ref_out) diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index bb07e87d98..8b4f671fe0 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -1707,6 +1707,78 @@ def test_swiglu( torch.testing.assert_close(y_test, y_ref, **tols) torch.testing.assert_close(dx_test, x_ref.grad, **tols) + @pytest.mark.parametrize("dtype", _dtypes) + @pytest.mark.parametrize("quantization", _quantization_list) + @pytest.mark.parametrize("quantize_forward", (False, True)) + @pytest.mark.parametrize("quantize_backward", (False, True)) + def test_clamped_swiglu( + self, + *, + out_shape: Iterable[int] = (32, 32), + dtype: torch.dtype, + device: torch.device = "cuda", + quantization: Optional[str], + quantize_forward: bool, + quantize_backward: bool, + limit: float = 0.75, + alpha: float = 1.702, + ): + # Test SwiGLU variant used in GPT OSS. 
+ # Tensor dimensions + in_shape = list(out_shape) + in_shape[-1] *= 2 + + # Skip invalid configurations + quantized_compute = quantization is not None + if not quantized_compute and (quantize_forward or quantize_backward): + pytest.skip("Quantization scheme has not been provided") + maybe_skip_quantization(quantization, dims=in_shape, device=device) + + # Random data + x_ref, x_test = make_reference_and_test_tensors( + in_shape, + test_dtype=dtype, + test_device=device, + ) + dy_ref, dy_test = make_reference_and_test_tensors( + out_shape, + test_dtype=dtype, + test_device=device, + requires_grad=False, + ) + + # Plain PyTorch implementation + x_glu, x_linear = x_ref.chunk(2, dim=-1) + x_glu = x_glu.clamp(min=None, max=limit) + x_linear = x_linear.clamp(min=-limit, max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + y_ref = out_glu * (x_linear + 1) + y_ref.backward(dy_ref) + + # Implementation with fusible operation + recipe = make_recipe(quantization) + + forward = te_ops.Sequential( + te_ops.Quantize(forward=False, backward=quantize_backward), + te_ops.ClampedSwiGLU(limit=limit, alpha=alpha), + te_ops.Quantize(forward=quantize_forward, backward=False), + ) + with te.fp8_autocast(enabled=quantized_compute, fp8_recipe=recipe): + y_test = forward(x_test) + + y_test.backward(dy_test) + + # Expected numerical error + tols = dtype_tols(dtype) + if quantized_compute: + tols = dtype_tols(tex.DType.kFloat8E4M3) + + # Check results + y_test = y_test.to(dtype=torch.float64, device="cpu") + dx_test = x_test.grad.to(dtype=torch.float64, device="cpu") + torch.testing.assert_close(y_test, y_ref, **tols) + torch.testing.assert_close(dx_test, x_ref.grad, **tols) + @pytest.mark.parametrize("scale", (1, 0, -2.5, 3.5)) @pytest.mark.parametrize("shape", ((), (1, 13), (4, 4, 2))) @pytest.mark.parametrize("dtype", _dtypes) diff --git a/transformer_engine/common/activation/activation_template.h b/transformer_engine/common/activation/activation_template.h index 67f173a4ab..1d9a3fb43c 100644 --- a/transformer_engine/common/activation/activation_template.h +++ b/transformer_engine/common/activation/activation_template.h @@ -51,22 +51,20 @@ void dact_fn(const NVTETensor grad, const NVTETensor input, NVTETensor output, } template -void gated_act_fn(const NVTETensor input, NVTETensor output, cudaStream_t stream) { +void gated_act_fn(const NVTETensor input, NVTETensor output, Param &p, cudaStream_t stream) { using namespace detail; constexpr bool IS_DGATED = false; constexpr NVTETensor grad = nullptr; - - quantize_gated_helper(grad, input, output, stream); + quantize_gated_helper(grad, input, output, p, stream); } template -void dgated_act_fn(const NVTETensor grad, const NVTETensor input, NVTETensor output, +void dgated_act_fn(const NVTETensor grad, const NVTETensor input, NVTETensor output, Param &p, cudaStream_t stream) { using namespace detail; constexpr bool IS_DGATED = true; - - quantize_gated_helper(grad, input, output, stream); + quantize_gated_helper(grad, input, output, p, stream); } } // namespace transformer_engine diff --git a/transformer_engine/common/activation/gelu.cu b/transformer_engine/common/activation/gelu.cu index 0cf43007a7..4949ba5906 100644 --- a/transformer_engine/common/activation/gelu.cu +++ b/transformer_engine/common/activation/gelu.cu @@ -23,14 +23,16 @@ void nvte_dgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output void nvte_geglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_geglu); using namespace transformer_engine; - 
gated_act_fn>(input, output, stream); + Empty e = {}; + gated_act_fn>(input, output, e, stream); } void nvte_dgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_dgeglu); using namespace transformer_engine; - dgated_act_fn, dgelu>(grad, input, output, stream); + Empty e = {}; + dgated_act_fn, dgelu>(grad, input, output, e, stream); } void nvte_qgelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { @@ -49,12 +51,14 @@ void nvte_dqgelu(const NVTETensor grad, const NVTETensor input, NVTETensor outpu void nvte_qgeglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_qgeglu); using namespace transformer_engine; - gated_act_fn>(input, output, stream); + Empty e = {}; + gated_act_fn>(input, output, e, stream); } void nvte_dqgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_dqgeglu); using namespace transformer_engine; - dgated_act_fn, dqgelu>(grad, input, output, stream); + Empty e = {}; + dgated_act_fn, dqgelu>(grad, input, output, e, stream); } diff --git a/transformer_engine/common/activation/relu.cu b/transformer_engine/common/activation/relu.cu index a794b7315f..c74fc6eee9 100644 --- a/transformer_engine/common/activation/relu.cu +++ b/transformer_engine/common/activation/relu.cu @@ -23,14 +23,16 @@ void nvte_drelu(const NVTETensor grad, const NVTETensor input, NVTETensor output void nvte_reglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_reglu); using namespace transformer_engine; - gated_act_fn>(input, output, stream); + Empty e = {}; + gated_act_fn>(input, output, e, stream); } void nvte_dreglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_dreglu); using namespace transformer_engine; - dgated_act_fn, drelu>(grad, input, output, stream); + Empty e = {}; + dgated_act_fn, drelu>(grad, input, output, e, stream); } void nvte_srelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { @@ -49,12 +51,14 @@ void nvte_dsrelu(const NVTETensor grad, const NVTETensor input, NVTETensor outpu void nvte_sreglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_sreglu); using namespace transformer_engine; - gated_act_fn>(input, output, stream); + Empty e = {}; + gated_act_fn>(input, output, e, stream); } void nvte_dsreglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_dsreglu); using namespace transformer_engine; - dgated_act_fn, dsrelu>(grad, input, output, stream); + Empty e = {}; + dgated_act_fn, dsrelu>(grad, input, output, e, stream); } diff --git a/transformer_engine/common/activation/swiglu.cu b/transformer_engine/common/activation/swiglu.cu index 8194964745..cafc48abba 100644 --- a/transformer_engine/common/activation/swiglu.cu +++ b/transformer_engine/common/activation/swiglu.cu @@ -23,12 +23,31 @@ void nvte_dsilu(const NVTETensor grad, const NVTETensor input, NVTETensor output void nvte_swiglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_swiglu); using namespace transformer_engine; - gated_act_fn>(input, output, stream); + Empty e = {}; + gated_act_fn>(input, output, e, stream); } void nvte_dswiglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream) { NVTE_API_CALL(nvte_dswiglu); using namespace transformer_engine; - dgated_act_fn, 
dsilu>(grad, input, output, stream); + Empty e = {}; + dgated_act_fn, dsilu>(grad, input, output, e, stream); +} + +void nvte_clamped_swiglu(const NVTETensor input, NVTETensor output, float limit, float alpha, + cudaStream_t stream) { + NVTE_API_CALL(nvte_clamped_swiglu); + using namespace transformer_engine; + ClampedSwiGLUParam param = {limit, alpha}; + gated_act_fn>(input, output, param, stream); +} + +void nvte_clamped_dswiglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, + float limit, float alpha, cudaStream_t stream) { + NVTE_API_CALL(nvte_clamped_dswiglu); + using namespace transformer_engine; + ClampedSwiGLUParam param = {limit, alpha}; + dgated_act_fn, clamped_dsilu>( + grad, input, output, param, stream); } diff --git a/transformer_engine/common/include/transformer_engine/activation.h b/transformer_engine/common/include/transformer_engine/activation.h index 49029ed588..4e48088586 100644 --- a/transformer_engine/common/include/transformer_engine/activation.h +++ b/transformer_engine/common/include/transformer_engine/activation.h @@ -39,6 +39,7 @@ enum class NVTE_Activation_Type { QGEGLU, SRELU, SREGLU, + CLAMPED_SWIGLU }; /*! \brief Computes the GeLU activation of the input. @@ -173,6 +174,26 @@ void nvte_geglu(const NVTETensor input, NVTETensor output, cudaStream_t stream); */ void nvte_swiglu(const NVTETensor input, NVTETensor output, cudaStream_t stream); +/*! \brief Computes the gated Swish activation of the input used in GPT OSS. + * + * See https://github.com/openai/gpt-oss/blob/a0a84273e9e0c14a233cb9befdfd159c2bcfa6cd/gpt_oss/torch/model.py#L250 + * This Gated activation has two differences compared to the original SwiGLU + * 1. Both gate and pre-activations are clipped based on parameter limit. + * 2. Activation uses sigmoid(alpha * x) instead of sigmoid(x) used in Swish activation inspired + * by original GELU paper https://arxiv.org/pdf/1606.08415 + * If the scaling mode of the output tensor is set to NVTE_MXFP8_1D_SCALING, + * the block quantization (MXFP8) of the specified shape of the block will be used. + * + * \param[in] input Input tensor of shape [N, H * 2]. + * \param[in,out] output Output tensor of shape [N, H]. + * It computes Act(input[N, :H]) x input[N, H:] + * \param[in] limit Clipping limits for gate and pre-activation. + * \param[in] alpha Scaling factor for the sigmoid function used in the activation. + * \param[in] stream CUDA stream used for the operation. + */ +void nvte_clamped_swiglu(const NVTETensor input, NVTETensor output, float limit, float alpha, + cudaStream_t stream); + /*! \brief Computes the gated ReLU activation of the input. * If the scaling mode of the output tensor is set to NVTE_MXFP8_1D_SCALING, * the block quantization (MXFP8) of the specified shape of the block will be used. @@ -230,6 +251,26 @@ void nvte_dgeglu(const NVTETensor grad, const NVTETensor input, NVTETensor outpu void nvte_dswiglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, cudaStream_t stream); +/*! \brief Computes the gradient of gated Swish activation of the input used in GPT OSS. + * + * https://github.com/openai/gpt-oss/blob/a0a84273e9e0c14a233cb9befdfd159c2bcfa6cd/gpt_oss/torch/model.py#L250 + * This activation has two differences compared to the original SwiGLU + * 1. Both gate and pre-activations are clipped based on parameter limit. + * 2. 
Activation uses sigmoid(alpha * x) instead of sigmoid(x) used in Swish activation inspired + * by original GELU paper https://arxiv.org/pdf/1606.08415 + * If the scaling mode of the output tensor is set to NVTE_MXFP8_1D_SCALING, + * the block quantization (MXFP8) of the specified shape of the block will be used. + * + * \param[in] grad Incoming gradient of shape [N, H]. + * \param[in] input Forward input tensor of shape [N, H * 2]. + * \param[in,out] output Outgoing gradient of shape [N, H * 2]. + * \param[in] limit Clipping limits for gate and pre-activation. + * \param[in] alpha Scaling factor for the sigmoid function used in the activation. + * \param[in] stream CUDA stream used for the operation. + */ +void nvte_clamped_dswiglu(const NVTETensor grad, const NVTETensor input, NVTETensor output, + float limit, float alpha, cudaStream_t stream); + /*! \brief Computes the gated ReLU activation gradient. * If the scaling mode of the output tensor is set to NVTE_MXFP8_1D_SCALING, * the block quantization (MXFP8) of the specified shape of the block will be used. diff --git a/transformer_engine/common/util/cast_gated_kernels.cuh b/transformer_engine/common/util/cast_gated_kernels.cuh index 50ff82d85f..1c1578ac53 100644 --- a/transformer_engine/common/util/cast_gated_kernels.cuh +++ b/transformer_engine/common/util/cast_gated_kernels.cuh @@ -55,7 +55,8 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) const __grid_constant__ CUtensorMap tensor_map_output_act, const __grid_constant__ CUtensorMap tensor_map_output_gate, float *const amax_ptr, float *const scale_inv_ptr, - const float *const scale_ptr, const size_t rows, const size_t cols) { + const float *const scale_ptr, const size_t rows, const size_t cols, + const ParamOP p) { #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) const size_t chunk_offset_Y = blockIdx.y * CHUNK_DIM_Y; @@ -161,7 +162,6 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) IType *in_gate_sh_curr = in_gate_sh + buff * buff_elems; OType *out_act_sh_curr = out_act_sh + buff * buff_elems; OType *out_gate_sh_curr = out_gate_sh + buff * buff_elems; - #pragma unroll for (int stage = 0; stage < BUFFER_STAGES_NUM; ++stage) { const size_t stage_offset_Y = stage * THREADS_PER_CHUNK_Y; @@ -171,6 +171,12 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) float act_elt = static_cast(in_act_sh_curr[shmem_idx]); float gate_elt = static_cast(in_gate_sh_curr[shmem_idx]); + bool dgate_elt = true; // gating is ideally an identity function + if constexpr (std::is_same::value) { + // In case of GPT OSS, clamp the activation and gate values + dgate_elt = gate_elt < p.limit && gate_elt > -p.limit; // Derivative of clamp + gate_elt = min(max(-p.limit, gate_elt), p.limit) + 1; + } if constexpr (IS_DGATED) { float grad_elt = static_cast(in_grad_sh_curr[shmem_idx]); @@ -178,18 +184,28 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) const float x = act_elt; float act_x; float dact_x; - - if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { - const float s = sigmoidf(x); + if constexpr (std::is_same::value) { + const float x = min(act_elt, p.limit); + const float s = sigmoidf(p.alpha * x); act_x = x * s; - dact_x = x * s * (1 - s) + s; + if (act_elt < p.limit) { + dact_x = s + s * (1 - s) * p.alpha * x; + } else { + dact_x = 0.0f; + } } else { - act_x = ActOP(x, {}); - dact_x = DActOP(x, {}); + if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { + const float s = sigmoidf(x); + act_x = x * s; + dact_x = x * s * (1 - s) + s; + } else { + act_x = ActOP(x, p); + 
dact_x = DActOP(x, p); + } } float after_dact = dact_x * grad_elt * gate_elt; - float after_dgate = act_x * grad_elt; + float after_dgate = dgate_elt ? act_x * grad_elt : 0.0f; out_act_sh_curr[shmem_idx] = static_cast(scale * after_dact); out_gate_sh_curr[shmem_idx] = static_cast(scale * after_dgate); @@ -197,7 +213,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) amax = fmaxf(amax, fabsf(after_dact)); amax = fmaxf(amax, fabsf(after_dgate)); } else { - const float after_act = ActOP(act_elt, {}) * gate_elt; + const float after_act = ActOP(act_elt, p) * gate_elt; out_act_sh_curr[shmem_idx] = static_cast(scale * after_act); amax = fmaxf(amax, fabsf(after_act)); } @@ -300,7 +316,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) const __grid_constant__ CUtensorMap tensor_map_output_gate_colwise, e8m0_t *const scales_rowwise, e8m0_t *const scales_colwise, const size_t rows, const size_t cols, const size_t scale_stride_rowwise, - const size_t scale_stride_colwise) { + const size_t scale_stride_colwise, const ParamOP p) { #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) using IType2 = typename ptx::FPx2; using OType2 = typename ptx::FPx2; @@ -476,25 +492,37 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) float gate_elt = static_cast(in_gate_sh[shmem_offset_colwise]); float after_act_elt; float after_gate_elt; - + bool dgate_elt = true; // gating is ideally an identity function + if constexpr (std::is_same::value) { + // In case of GPT OSS, clamp the activation and gate values + dgate_elt = gate_elt < p.limit && gate_elt > -p.limit; // Derivative of clamp + gate_elt = min(max(-p.limit, gate_elt), p.limit) + 1.0f; + } if constexpr (IS_DGATED) { float grad_elt = static_cast(in_grad_sh[shmem_offset_colwise]); const float x = act_elt; float act_x; float dact_x; - - if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { - const float s = sigmoidf(x); + if constexpr (std::is_same::value) { + const float x = min(act_elt, p.limit); + const float s = sigmoidf(p.alpha * x); act_x = x * s; - dact_x = x * s * (1 - s) + s; + dact_x = act_elt < p.limit ? s + s * (1 - s) * p.alpha * x : 0.0f; } else { - act_x = ActOP(x, {}); - dact_x = DActOP(x, {}); + if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { + const float s = sigmoidf(x); + act_x = x * s; + dact_x = x * s * (1 - s) + s; + } else { + act_x = ActOP(x, p); + dact_x = DActOP(x, p); + } } + after_act_elt = dact_x * grad_elt * gate_elt; - after_gate_elt = act_x * grad_elt; + after_gate_elt = dgate_elt ? 
act_x * grad_elt : 0.0f; } else { - after_act_elt = ActOP(act_elt, {}) * gate_elt; + after_act_elt = ActOP(act_elt, p) * gate_elt; } // Numerical truncation: Downcast to IType (BF16/FP16), then upcast it back to FP32 if constexpr (!std::is_same_v) { @@ -719,27 +747,39 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) float gate_elt = static_cast(in_gate.data.elt[e]); float after_act_elt; float after_gate_elt; - + bool dgate_elt = true; + if constexpr (std::is_same::value) { + // In case of GPT OSS, clamp the activation and gate values + dgate_elt = gate_elt < p.limit && gate_elt > -p.limit; // Derivative of clamp + gate_elt = min(max(-p.limit, gate_elt), p.limit) + 1.0f; + } if constexpr (IS_DGATED) { float grad_elt = static_cast(in_grad.data.elt[e]); const float x = act_elt; float act_x; float dact_x; - - if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { - const float s = sigmoidf(x); + if constexpr (std::is_same::value) { + const float x = min(act_elt, p.limit); + const float s = sigmoidf(p.alpha * x); act_x = x * s; - dact_x = x * s * (1 - s) + s; + dact_x = act_elt < p.limit ? s + s * (1 - s) * p.alpha * x : 0.0f; } else { - act_x = ActOP(x, {}); - dact_x = DActOP(x, {}); + if constexpr ((ActOP == &silu) && (DActOP == &dsilu)) { + const float s = sigmoidf(x); + act_x = x * s; + dact_x = x * s * (1 - s) + s; + } else { + act_x = ActOP(x, p); + dact_x = DActOP(x, p); + } } + after_act_elt = dact_x * grad_elt * gate_elt; - after_gate_elt = act_x * grad_elt; + after_gate_elt = dgate_elt ? act_x * grad_elt : 0.0f; after_act_rowwise[j] = after_act_elt; after_gate_rowwise[j] = after_gate_elt; } else { - after_act_elt = ActOP(act_elt, {}) * gate_elt; + after_act_elt = ActOP(act_elt, p) * gate_elt; after_act_rowwise[j] = after_act_elt; } @@ -883,7 +923,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) template -void cast_fp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, +void cast_fp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, ParamOP p, cudaStream_t stream) { checkCuDriverContext(stream); @@ -957,15 +997,14 @@ void cast_fp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *outpu cast_fp8_gated_kernel <<>>( tensor_map_grad, tensor_map_input_act, tensor_map_input_gate, tensor_map_output_act, - tensor_map_output_gate, amax_ptr, scale_inv_ptr, scale_ptr, rows, - cols); + tensor_map_output_gate, amax_ptr, scale_inv_ptr, scale_ptr, rows, cols, p); NVTE_CHECK_CUDA(cudaGetLastError());); // NOLINT(*) ); // NOLINT(*) } template -void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, +void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, ParamOP p, cudaStream_t stream) { checkCuDriverContext(stream); @@ -1096,7 +1135,7 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out tensor_map_output_act_rowwise, tensor_map_output_gate_rowwise, tensor_map_output_act_colwise, tensor_map_output_gate_colwise, scales_rowwise_ptr, scales_colwise_ptr, rows, cols, scale_stride_rowwise, - scale_stride_colwise); + scale_stride_colwise, p); NVTE_CHECK_CUDA(cudaGetLastError()); break; case ScalingType::COLWISE: @@ -1113,7 +1152,7 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out tensor_map_output_act_rowwise, tensor_map_output_gate_rowwise, tensor_map_output_act_colwise, tensor_map_output_gate_colwise, scales_rowwise_ptr, scales_colwise_ptr, rows, cols, scale_stride_rowwise, - scale_stride_colwise); + scale_stride_colwise, 
p); NVTE_CHECK_CUDA(cudaGetLastError()); break; case ScalingType::BIDIMENSIONAL: @@ -1130,7 +1169,7 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out tensor_map_output_act_rowwise, tensor_map_output_gate_rowwise, tensor_map_output_act_colwise, tensor_map_output_gate_colwise, scales_rowwise_ptr, scales_colwise_ptr, rows, cols, scale_stride_rowwise, - scale_stride_colwise); + scale_stride_colwise, p); NVTE_CHECK_CUDA(cudaGetLastError()); break; }); // NOLINT(*) @@ -1138,12 +1177,8 @@ void cast_mxfp8_gated(const Tensor &grad, const Tensor &gated_input, Tensor *out } template -void cast_gated(const Tensor &input, Tensor *output, cudaStream_t stream) { - CheckInputTensor(input, "gated_act_input"); +void cast_gated(const Tensor &input, Tensor *output, ParamOP p, cudaStream_t stream) { CheckOutputTensor(*output, "gated_act_output"); - NVTE_CHECK(output->flat_first_dim() == input.flat_first_dim(), - "Wrong output shape. Expected (after flattening) [", input.flat_first_dim(), - ", *], got [", output->flat_first_dim(), ", ", output->flat_last_dim(), "]."); NVTE_CHECK(input.flat_last_dim() % 2 == 0, "Wrong input shape. Expected (after flattening) last dimension to be even, ", "got [", input.flat_first_dim(), ", ", input.flat_last_dim(), "]."); @@ -1165,7 +1200,7 @@ void cast_gated(const Tensor &input, Tensor *output, cudaStream_t stream) { reinterpret_cast(output->scale.dptr), reinterpret_cast(output->amax.dptr), reinterpret_cast(output->scale_inv.dptr), input.flat_first_dim(), - output->flat_last_dim(), {}, stream); + output->flat_last_dim(), p, stream); } else { NVTE_ERROR("Not implemented scaling mode: " + to_string(output->scaling_mode) + "."); }); // NOLINT(*) @@ -1174,7 +1209,8 @@ void cast_gated(const Tensor &input, Tensor *output, cudaStream_t stream) { template -void cast_dgated(const Tensor &grad, const Tensor &input, Tensor *output, cudaStream_t stream) { +void cast_dgated(const Tensor &grad, const Tensor &input, Tensor *output, ParamOP p, + cudaStream_t stream) { CheckInputTensor(grad, "dgated_act_grad"); CheckInputTensor(input, "dgated_act_input"); CheckOutputTensor(*output, "dgated_act_output"); @@ -1203,7 +1239,7 @@ void cast_dgated(const Tensor &grad, const Tensor &input, Tensor *output, cudaSt reinterpret_cast(output->scale.dptr), reinterpret_cast(output->amax.dptr), reinterpret_cast(output->scale_inv.dptr), grad.flat_first_dim(), - grad.flat_last_dim(), {}, stream); + grad.flat_last_dim(), p, stream); } else { NVTE_ERROR("Not implemented scaling mode: " + to_string(output->scaling_mode) + "."); }); // NOLINT(*) @@ -1212,7 +1248,7 @@ void cast_dgated(const Tensor &grad, const Tensor &input, Tensor *output, cudaSt template -void quantize_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, +void quantize_gated(const Tensor &grad, const Tensor &gated_input, Tensor *output, ParamOP p, cudaStream_t stream) { constexpr bool allow_empty = false; CheckInputTensor(gated_input, "gated_input"); @@ -1252,17 +1288,17 @@ void quantize_gated(const Tensor &grad, const Tensor &gated_input, Tensor *outpu if (is_delayed_tensor_scaling(output->scaling_mode)) { if (use_tma_kernels) { - cast_fp8_gated(grad, gated_input, output, stream); + cast_fp8_gated(grad, gated_input, output, p, stream); } else { if constexpr (IS_DGATED) { - cast_dgated(grad, gated_input, output, stream); + cast_dgated(grad, gated_input, output, p, stream); } else { - cast_gated(gated_input, output, stream); + cast_gated(gated_input, output, p, stream); } } } else if 
(is_mxfp_scaling(output->scaling_mode)) { if (use_tma_kernels) { - cast_mxfp8_gated(grad, gated_input, output, stream); + cast_mxfp8_gated(grad, gated_input, output, p, stream); } else { NVTE_ERROR("Invalid input shape. Expected the last dimension to be divisible ", "by 32, got input of shape ", gated_input.data.shape); @@ -1278,7 +1314,7 @@ namespace detail { template void quantize_gated_helper(const NVTETensor grad, const NVTETensor gated_input, NVTETensor output, - cudaStream_t stream) { + ParamOP p, cudaStream_t stream) { using namespace gated_kernels; Tensor grad_empty_tensor; const Tensor &grad_tensor = IS_DGATED ? *(convertNVTETensorCheck(grad)) : grad_empty_tensor; @@ -1287,13 +1323,14 @@ void quantize_gated_helper(const NVTETensor grad, const NVTETensor gated_input, if (is_supported_by_CC_100()) { quantize_gated(grad_tensor, gated_input_tensor, - output_tensor, stream); + output_tensor, p, stream); } else { if (is_delayed_tensor_scaling(output_tensor->scaling_mode)) { if constexpr (IS_DGATED) { - cast_dgated(grad_tensor, gated_input_tensor, output_tensor, stream); + cast_dgated(grad_tensor, gated_input_tensor, output_tensor, p, + stream); } else { - cast_gated(gated_input_tensor, output_tensor, stream); + cast_gated(gated_input_tensor, output_tensor, p, stream); } } else { // MX scaling diff --git a/transformer_engine/common/util/math.h b/transformer_engine/common/util/math.h index 2d425d6753..2f20817fb0 100644 --- a/transformer_engine/common/util/math.h +++ b/transformer_engine/common/util/math.h @@ -11,6 +11,11 @@ namespace transformer_engine { struct Empty {}; +struct ClampedSwiGLUParam { + float limit; + float alpha = 1.702f; // Default value for QuickGELU +}; + template __device__ inline OType gelu(const IType val, const Empty&) { const float cval = val; @@ -38,17 +43,29 @@ __device__ inline OType dsigmoid(const IType val, const Empty& e) { return s * (1.f - s); } +template +__device__ inline OType qgelu_with_alpha(const IType val, const float alpha) { + const float cval = val; + Empty e = {}; + return cval * sigmoid(alpha * cval, e); +} + template __device__ inline OType qgelu(const IType val, const Empty& e) { + return qgelu_with_alpha(val, 1.702f); +} + +template +__device__ inline OType dqgelu_with_alpha(const IType val, const float alpha) { const float cval = val; - return cval * sigmoid(1.702f * cval, e); + Empty e = {}; + return alpha * cval * dsigmoid(alpha * cval, e) + + sigmoid(alpha * cval, e); } template __device__ inline OType dqgelu(const IType val, const Empty& e) { - const float cval = val; - return 1.702f * cval * dsigmoid(1.702f * cval, e) + - sigmoid(1.702f * cval, e); + return dqgelu_with_alpha(val, 1.702f); } template @@ -57,12 +74,26 @@ __device__ inline OType silu(const IType val, const Empty& e) { return cval * sigmoid(cval, e); } +template +__device__ inline OType clamped_silu(const IType val, const ClampedSwiGLUParam& p) { + const float cval = min(p.limit, static_cast(val)); // Clamping + return qgelu_with_alpha(cval, p.alpha); +} + template __device__ inline OType dsilu(const IType val, const Empty& e) { const float cval = val; return cval * dsigmoid(cval, e) + sigmoid(cval, e); } +template +__device__ inline OType clamped_dsilu(const IType val, const ClampedSwiGLUParam& p) { + const bool dclamp_val = static_cast(val) <= p.limit; + const float clamp_val = min(static_cast(val), p.limit); + const float dsilu_val = dqgelu_with_alpha(clamp_val, p.alpha); + return dclamp_val ? 
dsilu_val : 0.0f; +} + template __device__ inline OType relu(IType value, const Empty&) { return fmaxf(value, 0.f); diff --git a/transformer_engine/common/util/vectorized_pointwise.h b/transformer_engine/common/util/vectorized_pointwise.h index 0d667a0ece..4ad1c16de8 100644 --- a/transformer_engine/common/util/vectorized_pointwise.h +++ b/transformer_engine/common/util/vectorized_pointwise.h @@ -11,7 +11,7 @@ #include "../common.h" #include "../utils.cuh" - +#include "math.h" namespace transformer_engine { /* \brief Helper class that enables storing multiple values of type DType @@ -338,7 +338,7 @@ template void VectorizedUnaryKernelLauncher(const InputType *input, const fp32 *noop, OutputType *output, const fp32 *scale, fp32 *amax, fp32 *scale_inv, const size_t N, - const Param params, cudaStream_t stream) { + const Param ¶ms, cudaStream_t stream) { if (N != 0) { auto align = CheckAlignment(N, nvec, input, output); @@ -431,7 +431,13 @@ __launch_bounds__(unary_kernel_threads) __global__ #pragma unroll for (int i = 0; i < nvec; ++i) { const ComputeType val = static_cast(loader0.separate()[i]); - const ComputeType val2 = static_cast(loader1.separate()[i]); + ComputeType val2 = static_cast(loader1.separate()[i]); + + if constexpr (std::is_same::value) { + // Clamp the gated value and add 1 at the end + ComputeType limit = p.limit; + val2 = std::min(std::max(-limit, val2), limit) + 1; + } ComputeType temp = static_cast(Activation(val, p) * val2); if (requires_amax) { __builtin_assume(max >= 0); @@ -532,10 +538,18 @@ __launch_bounds__(unary_kernel_threads) __global__ for (int i = 0; i < nvec; ++i) { const ComputeType grad_val = static_cast(grad_loader.separate()[i]); const ComputeType gelu_in = static_cast(input_loader0.separate()[i]); - const ComputeType gate_in = static_cast(input_loader1.separate()[i]); + ComputeType gate_in = static_cast(input_loader1.separate()[i]); + bool dgate_in = true; + + if constexpr (std::is_same::value) { + // In case of GPT OSS, clamp the activation and gate values + const ComputeType limit = p.limit; + dgate_in = gate_in < limit && gate_in > -limit; // Derivative of clamp + gate_in = std::min(std::max(-limit, gate_in), limit) + 1.0f; + } ComputeType after_dgelu = Dactivation(gelu_in, p) * grad_val * gate_in; - ComputeType after_dgate = grad_val * Activation(gelu_in, p); + ComputeType after_dgate = dgate_in ? grad_val * Activation(gelu_in, p) : 0.0f; if (requires_amax) { __builtin_assume(max >= 0); diff --git a/transformer_engine/jax/activation.py b/transformer_engine/jax/activation.py index 12b35ec43c..daa3679c48 100644 --- a/transformer_engine/jax/activation.py +++ b/transformer_engine/jax/activation.py @@ -11,7 +11,6 @@ import jax import jax.numpy as jnp - from . import cpp_extensions as tex from .quantize.tensor import NoScaleTensor @@ -22,6 +21,7 @@ def activation( x: jnp.ndarray, activation_type: Sequence[Union[str, Callable]], quantizer: Optional[Quantizer] = None, + act_params: Optional[tex.activation.ActivationParams] = None, ) -> jnp.ndarray: """Apply activation functions to input tensor with optional quantization. @@ -32,17 +32,19 @@ def activation( x: Input tensor to apply activations to activation_type: Sequence of activation functions quantizer: Optional quantizer for quantizing the output + act_params: Optional activation parameters. Currently used + just for ClampedSwiGLU. 
Returns: Activated output tensor """ assert x.shape[-1] % len(activation_type) == 0 - output = _activation(x, activation_type, quantizer) + output = _activation(x, activation_type, quantizer, act_params) return output -@partial(jax.custom_vjp, nondiff_argnums=(1,)) -def _activation(x, activation_type, quantizer): +@partial(jax.custom_vjp, nondiff_argnums=(1, 3)) +def _activation(x, activation_type, quantizer, act_params): """Internal implementation of activation with custom VJP. This function implements the core activation logic with support for @@ -52,36 +54,42 @@ def _activation(x, activation_type, quantizer): x: Input tensor activation_type: Sequence of activation functions quantizer: Optional quantizer + act_params: Optional activation parameters. Currently used + just for ClampedSwiGLU. Returns: Activated tensor """ - _output, _ = _activation_fwd_rule(x, activation_type, quantizer) + _output, _ = _activation_fwd_rule(x, activation_type, quantizer, act_params) return _output -def _activation_fwd_rule(x, activation_type, quantizer): +def _activation_fwd_rule(x, activation_type, quantizer, act_params): """Forward pass rule for activation function. Args: x: Input tensor activation_type: Sequence of activation functions quantizer: Optional quantizer + act_params: Optional activation parameters. Currently used + just for ClampedSwiGLU. Returns: Tuple of (output, context) for backward pass """ - fwd_output = tex.act_lu(x, activation_type, quantizer) + fwd_output = tex.act_lu(x, activation_type, quantizer, act_params) # This is a no-op for higher-precision tensors fwd_output = fwd_output.dequantize() return fwd_output, (x, quantizer) -def _activation_bwd_rule(activation_type, ctx, g): +def _activation_bwd_rule(activation_type, act_params, ctx, g): """Backward pass rule for activation function. Args: activation_type: Sequence of activation functions + act_params: Optional activation parameters. Currently used + just for ClampedSwiGLU. 
ctx: Context from forward pass g: Gradient from upstream @@ -90,7 +98,7 @@ def _activation_bwd_rule(activation_type, ctx, g): """ (x, _) = ctx assert x.dtype == g.dtype - dx = tex.dact_lu(g, x, activation_type) + dx = tex.dact_lu(g, x, activation_type, act_params=act_params) # No quantization is used in this VJP backward, so the output should # always be a NoScaleTensor assert isinstance(dx, NoScaleTensor) diff --git a/transformer_engine/jax/cpp_extensions/activation.py b/transformer_engine/jax/cpp_extensions/activation.py index a8c14a6087..925c1d01ae 100644 --- a/transformer_engine/jax/cpp_extensions/activation.py +++ b/transformer_engine/jax/cpp_extensions/activation.py @@ -5,6 +5,7 @@ from typing import Sequence, Union, Callable, Optional, Tuple import operator from functools import reduce, partial +from dataclasses import dataclass import jax import jax.numpy as jnp @@ -12,9 +13,9 @@ from jax.experimental.custom_partitioning import SdyShardingRule from jax.sharding import PartitionSpec +import numpy as np import transformer_engine_jax from transformer_engine_jax import NVTE_Activation_Type - from .base import BasePrimitive, register_primitive from .misc import ( jax_dtype_to_te_dtype, @@ -51,17 +52,87 @@ ("quick_gelu", "linear"): NVTE_Activation_Type.QGEGLU, ("squared_relu",): NVTE_Activation_Type.SRELU, ("squared_relu", "linear"): NVTE_Activation_Type.SREGLU, + ("clamped_silu", "clamped_linear"): NVTE_Activation_Type.CLAMPED_SWIGLU, } -def _convert_to_activation_function(fn_or_string): +@dataclass(frozen=True) +class ClampedSwigluParams: + """Parameters for the Clamped SwiGLU activation function + used in GPT OSS.""" + + limit: float = 7.0 + alpha: float = 1.702 + + def __hash__(self): + """Custom hash function to ensure dataclass is hashable for jax jit to work. + + Returns: + int: Hash value of the dataclass instance. + """ + return hash((self.limit, self.alpha)) + + def to_ffi_lowering_dict(self): + """Convert the activation parameters to a dictionary format for FFI lowering. + + Returns: + dict: A dictionary representation of the activation parameters consumable by + XLA FFI bindings for activation functions. + """ + return {"limit": np.float32(self.limit), "alpha": np.float32(self.alpha)} + + +@dataclass(frozen=True) +class ActivationParams: + """Parameters for various activation functions. + Currently only Clamped SwiGLU activation has parameters. + """ + + clamped_swiglu: ClampedSwigluParams = ClampedSwigluParams() + + @staticmethod + def create(activation_type, **kwargs): + """Factory method to create ActivationParams based on activation_type.""" + CLAMPED_ACTIVATION_TYPES = { + ("clamped_silu", "clamped_linear"), + "clamped_silu", + "clamped_linear", + } + if activation_type in CLAMPED_ACTIVATION_TYPES: + return ActivationParams(ClampedSwigluParams(**kwargs)) + return ActivationParams() # Default params for activations without parameters + + def __hash__(self): + """Custom hash function to ensure dataclass is hashable for jax jit to work""" + return hash((self.clamped_swiglu,)) + + def to_ffi_lowering_dict(self): + """Convert the activation parameters to a dictionary format for FFI lowering. + Returns: + dict: A dictionary representation of the activation parameters consumable by + XLA FFI bindings for activation functions. 
+ """ + return {"clamped_swiglu": self.clamped_swiglu.to_ffi_lowering_dict()} + + +def _convert_to_activation_function(fn_or_string, act_params: ActivationParams): """Convert a string to an activation function.""" if fn_or_string == "linear": return lambda x: x + if fn_or_string == "clamped_linear": + # This function is used for ClampedSwiGLU + # used in GPT OSS where the gates are not only clamped + # but also shifted by +1 + limit = act_params.clamped_swiglu.limit + return lambda x: jnp.clip(x, min=-limit, max=limit) + 1 if fn_or_string == "quick_gelu": return lambda x: jax.nn.sigmoid(1.702 * x) * x if fn_or_string == "squared_relu": return lambda x: reduce(operator.mul, [jax.nn.relu(x), jax.nn.relu(x)]) + if fn_or_string == "clamped_silu": + limit = act_params.clamped_swiglu.limit + alpha = act_params.clamped_swiglu.alpha + return lambda x: jax.nn.sigmoid(alpha * jnp.minimum(x, limit)) * jnp.minimum(x, limit) if isinstance(fn_or_string, str): return getattr(jax.nn, fn_or_string) if callable(fn_or_string): @@ -84,7 +155,8 @@ class ActLuPrimitive(BasePrimitive): 6, 7, 8, - ) # out_dtype, act_enum, act_len, scaling_mode, is_2x, scale_dtype, is_outer + 9, + ) # out_dtype, act_enum, act_len, scaling_mode, is_2x, scale_dtype, is_outer, act_params inner_primitive = None outer_primitive = None @@ -100,11 +172,12 @@ def abstract( is_2x, scale_dtype, is_outer, + act_params, ): """ te_act_lu_p abstract """ - del act_enum + del act_enum, act_params dtype = dtypes.canonicalize_dtype(x_aval.dtype) assert dtype in [jnp.float32, jnp.float16, jnp.bfloat16] assert scale_aval is None or scale_aval.dtype == jnp.float32 @@ -150,6 +223,7 @@ def lowering( is_2x, scale_dtype, is_outer, + act_params, ): """ te_gated_act_lu_p lowering rules @@ -158,9 +232,14 @@ def lowering( x_aval, scale_aval = ctx.avals_in assert x_aval.dtype in [jnp.float32, jnp.float16, jnp.bfloat16] assert scale_aval is None or scale_aval.dtype == jnp.float32 - out = ffi.ffi_lowering(ActLuPrimitive.name)( - ctx, x, scale, act_enum=act_enum, scaling_mode=scaling_mode.value, is_2x=is_2x + ctx, + x, + scale, + act_enum=act_enum, + scaling_mode=scaling_mode.value, + is_2x=is_2x, + act_params=act_params.to_ffi_lowering_dict(), ) return out @@ -175,6 +254,7 @@ def impl( is_2x, scale_dtype, is_outer, + act_params, ): """ to describe implementation @@ -193,6 +273,7 @@ def impl( is_2x=is_2x, scale_dtype=scale_dtype, is_outer=False, + act_params=act_params, ) ) rowwise_scale_inv_shape, colwise_scale_inv_shape = ScalingMode( @@ -221,6 +302,7 @@ def batcher( is_2x, scale_dtype, is_outer, + act_params, ): """ to describe batch rules for vmap @@ -242,6 +324,7 @@ def batcher( scaling_mode=scaling_mode, is_2x=is_2x, scale_dtype=scale_dtype, + act_params=act_params, ), out_bdims, ) @@ -255,6 +338,7 @@ def infer_sharding_from_operands( is_2x, scale_dtype, is_outer, + act_params, mesh, arg_infos, result_infos, @@ -266,6 +350,7 @@ def infer_sharding_from_operands( scale_dtype, act_len, is_outer, + act_params, ) # Unused. 
x_spec = get_padded_spec(arg_infos[0]) scale_spec = get_padded_spec(arg_infos[1]) @@ -318,6 +403,7 @@ def partition( is_2x, scale_dtype, is_outer, + act_params, mesh, arg_infos, result_infos, @@ -378,6 +464,7 @@ def sharded_impl(x, scale): is_2x=is_2x, scale_dtype=scale_dtype, is_outer=True, + act_params=act_params, ) ) @@ -405,11 +492,12 @@ def shardy_sharding_rule( is_2x, scale_dtype, is_outer, + act_params, mesh, value_types, result_types, ): - del out_dtype, act_enum, act_len, scale_dtype, is_outer, mesh, result_types + del out_dtype, act_enum, act_len, scale_dtype, is_outer, mesh, result_types, act_params prefix = "ActLu_" input_shape = value_types[0].shape output_shape = input_shape[:-2] + input_shape[-1:] @@ -455,8 +543,8 @@ class BaseDActLuDBiasQuantizePrimitive(BasePrimitive): name = "te_dact_dbias_quantize_ffi" multiple_results = True - # out_dtype, scaling_mode, is_2x, scale_dtype, is_dbias, act_enum, act_len, is_outer - impl_static_args = (3, 4, 5, 6, 7, 8, 9, 10) + # out_dtype, scaling_mode, is_2x, scale_dtype, is_dbias, act_enum, act_len, is_outer, act_params + impl_static_args = (3, 4, 5, 6, 7, 8, 9, 10, 11) inner_primitive = None outer_primitive = None @@ -474,11 +562,12 @@ def abstract( act_enum, act_len, is_outer, + act_params, ): """ te_dact_dbias_quantize_p abstract """ - del act_enum + del act_enum, act_params dz_dtype = dtypes.canonicalize_dtype(dz_aval.dtype) assert dz_dtype in [jnp.float32, jnp.float16, jnp.bfloat16] assert x_aval.dtype == dz_dtype @@ -575,6 +664,7 @@ def lowering( act_enum, act_len, is_outer, + act_params, ): """ te_dact_dbias_quantize_p lowering rules @@ -593,6 +683,7 @@ def lowering( is_2x=is_2x, is_dbias=is_dbias, act_enum=int(act_enum), + act_params=act_params.to_ffi_lowering_dict(), ) @staticmethod @@ -608,6 +699,7 @@ def impl( act_enum, act_len, is_outer, + act_params, ): """ te_dact_dbias_quantize_p impl @@ -627,6 +719,7 @@ def impl( act_enum=act_enum, act_len=act_len, is_outer=False, + act_params=act_params, ) ) rowwise_scale_inv_shape, colwise_scale_inv_shape = ScalingMode( @@ -655,6 +748,7 @@ def batcher( act_enum, act_len, is_outer, + act_params, ): """ to describe batch rules for vmap @@ -685,6 +779,7 @@ def batcher( is_dbias=is_dbias, act_enum=act_enum, act_len=act_len, + act_params=act_params, ), out_bdims, ) @@ -699,11 +794,12 @@ def infer_sharding_from_operands( act_enum, act_len, is_outer, + act_params, mesh, arg_infos, result_infos, ): - del out_dtype, result_infos, act_enum + del out_dtype, result_infos, act_enum, act_params del scale_dtype, act_len, is_outer x_spec = get_padded_spec(arg_infos[1]) scale_spec = get_padded_spec(arg_infos[2]) @@ -774,6 +870,7 @@ def partition( act_enum, act_len, is_outer, + act_params, mesh, arg_infos, result_infos, @@ -854,6 +951,7 @@ def sharded_impl(dz, x, scale): act_enum=act_enum, act_len=act_len, is_outer=True, + act_params=act_params, ) ) if is_dbias: @@ -880,11 +978,13 @@ def shardy_sharding_rule( act_enum, act_len, is_outer, + act_params, mesh, value_types, result_types, ): - del out_dtype, scale_dtype, act_enum, act_len, is_outer, mesh, result_types + + del out_dtype, scale_dtype, act_enum, act_len, is_outer, mesh, result_types, act_params prefix = "DActLuDBias_" scale_rules = ScalingMode(scaling_mode).get_shardy_sharding_rules( value_types[1].shape, unique_var=prefix + "x", flatten_axis=-2 @@ -923,20 +1023,22 @@ class DActLuQuantizePrimitive(BaseDActLuDBiasQuantizePrimitive): """Subclass of BaseDActLuDBiasQuantizePrimitive for fused activation quantization without dbias. 
No change in functionality from the base primitive but named differently for use in more granular disabling of primitives via NVTE_JAX_CUSTOM_CALLS.""" -def _jax_act_lu(inputs, activation_type, quantizer=None) -> Union[NoScaleTensor, ScaledTensor]: +def _jax_act_lu( + inputs, activation_type, quantizer=None, act_params: Optional[ActivationParams] = None +) -> Union[NoScaleTensor, ScaledTensor]: """ JAX native activation implementation """ + act_params = act_params if act_params is not None else ActivationParams() act_len = len(activation_type) assert inputs.shape[-2] == act_len, ( "activation input should be replicated by act_len in the -2 axis, got input shape" f" {inputs.shape} and act_len {act_len}" ) - x = jnp.split(inputs, act_len, axis=-2) acts = [] for idx, act_fn in enumerate(activation_type): - x_i = _convert_to_activation_function(act_fn)(x[idx]) + x_i = _convert_to_activation_function(act_fn, act_params)(x[idx]) acts.append(x_i) x = reduce(operator.mul, acts) x = jnp.squeeze(x, axis=-2) @@ -951,10 +1053,12 @@ def _jax_quantize_dact_dbias( activation_type: Sequence[Union[str, Callable]], is_dbias: bool = True, quantizer: Optional[Quantizer] = None, + act_params: Optional[ActivationParams] = None, ): """ JAX implementation of dact_lu and dbias with optional quantization """ + act_params = act_params if act_params is not None else ActivationParams() act_len = len(activation_type) assert x.shape[-2] == act_len, ( "activation input should be replicated by act_len in the -2 axis, got input shape" @@ -962,7 +1066,8 @@ def _jax_quantize_dact_dbias( ) _, vjp_func = jax.vjp( - partial(_jax_act_lu, activation_type=activation_type), x.astype(jnp.float32) + partial(_jax_act_lu, activation_type=activation_type, act_params=act_params), + x.astype(jnp.float32), ) # VJP is using non-quantized backward for dact, so the input should always be wrapped in NoScaleTensor regardless of whether the forward pass used quantization or this dact will quantize afterwards. dz = NoScaleTensor(data=dz.astype(jnp.float32), amax=None) @@ -985,6 +1090,7 @@ def act_lu( x: jnp.ndarray, activation_type: Sequence[Union[str, Callable]], quantizer: Optional[Quantizer] = None, + act_params: Optional[ActivationParams] = None, amax_scope: AmaxScope = AmaxScope.LOCAL, ) -> Union[jnp.ndarray, ScaledTensor]: """Activation with optional quantization. 
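Reviewer note between these two hunks: a minimal usage sketch of the new act_params plumbing on the JAX side. The import paths and tensor shapes are assumptions inferred from this diff (the public activation() wrapper and ActivationParams.create); the limit/alpha values mirror the unit tests rather than required defaults.

import jax.numpy as jnp
from transformer_engine.jax import cpp_extensions as tex        # assumed import path
from transformer_engine.jax.activation import activation        # assumed import path

# The gated kernels expect the act/gate halves stacked along axis -2,
# i.e. an input of shape [..., len(activation_type), hidden].
x = jnp.ones((16, 2, 128), dtype=jnp.bfloat16)
act_type = ("clamped_silu", "clamped_linear")
params = tex.activation.ActivationParams.create(activation_type=act_type, limit=0.75, alpha=1.702)
y = activation(x, activation_type=act_type, quantizer=None, act_params=params)  # -> shape (16, 128)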
@@ -1008,24 +1114,22 @@ def act_lu( "activation input should be replicated by act_len in the -2 axis, got input shape" f" {x.shape} and act_len {act_len}" ) - + act_params = act_params if act_params is not None else ActivationParams() if not ActLuPrimitive.enabled(): - return _jax_act_lu(x, activation_type, quantizer) + return _jax_act_lu(x, activation_type, quantizer, act_params) # TE/common does not support colwise-only quantization yet if quantizer is not None and quantizer.q_layout == QuantizeLayout.COLWISE: - return _jax_act_lu(x, activation_type, quantizer) - + return _jax_act_lu(x, activation_type, quantizer, act_params) # TE/common does not support 2x quantization for DelayedScaling yet war_output = try_apply_delayed_scaling_2x_war( - f=act_lu, x=x, activation_type=activation_type, quantizer=quantizer + f=act_lu, x=x, activation_type=activation_type, quantizer=quantizer, act_params=act_params ) if war_output is not None: return war_output scale = jnp.empty((1,), jnp.float32) output_shape = (*x.shape[:-2], x.shape[-1]) - if quantizer is None: out, _, _, _, _ = ActLuPrimitive.outer_primitive.bind( x, @@ -1037,6 +1141,7 @@ def act_lu( is_2x=False, scale_dtype=jnp.float32, is_outer=True, + act_params=act_params, ) out = out.reshape(output_shape) out = NoScaleTensor( @@ -1051,6 +1156,7 @@ def act_lu( x=x, activation_type=activation_type, quantizer=None, + act_params=act_params, ) out, _ = _quantize_dbias_impl( out, @@ -1060,7 +1166,6 @@ def act_lu( amax_scope=amax_scope, ) return out - if isinstance(quantizer, DelayedScaleQuantizer): scale = quantizer.scale @@ -1080,6 +1185,7 @@ def act_lu( is_2x=quantizer.is_2x2x(), scale_dtype=quantizer.get_scale_dtype(), is_outer=True, + act_params=act_params, ) quantizer.update(updated_amax) @@ -1102,6 +1208,7 @@ def quantize_dact_dbias( activation_type: Sequence[Union[str, Callable]] = ("gelu",), is_dbias: bool = True, quantizer: Optional[Quantizer] = None, + act_params: Optional[ActivationParams] = None, ) -> Tuple[ScaledTensor, jnp.ndarray]: """Compute gradients of activation and bias with optional quantization. @@ -1118,7 +1225,7 @@ def quantize_dact_dbias( - The gradient of the activation with respect to the input. - The gradient of the activation with respect to the bias. 
""" - + act_params = act_params if act_params is not None else ActivationParams() act_len = len(activation_type) assert x.shape[-2] == act_len, ( "activation input should be replicated by act_len in the -2 axis, got input shape" @@ -1131,8 +1238,7 @@ def quantize_dact_dbias( if not PrimitiveClass.enabled() or ( quantizer is not None and quantizer.q_layout == QuantizeLayout.COLWISE ): - return _jax_quantize_dact_dbias(dz, x, activation_type, is_dbias, quantizer) - + return _jax_quantize_dact_dbias(dz, x, activation_type, is_dbias, quantizer, act_params) if quantizer is None: output, _, _, _, _, _ = PrimitiveClass.outer_primitive.bind( dz, @@ -1148,6 +1254,7 @@ def quantize_dact_dbias( act_enum=act_type_id, act_len=act_len, is_outer=True, + act_params=act_params, ) output = output.astype(x.dtype) dbias = None @@ -1163,7 +1270,11 @@ def quantize_dact_dbias( # TE/common does not support 1x dact_dbias_quantize on arch < 100 yet if should_apply_1x_fused_dbias_war_for_arch_l_100(is_dbias=is_dbias, quantizer=quantizer): out = dact_lu( - dz.astype(jnp.float32), x.astype(jnp.float32), activation_type, quantizer=None + dz.astype(jnp.float32), + x.astype(jnp.float32), + activation_type, + quantizer=None, + act_params=act_params, ) return _quantize_dbias_impl( out.data, quantizer, is_dbias=True, dq_dtype=x.dtype, flatten_axis=-2 @@ -1180,6 +1291,7 @@ def quantize_dact_dbias( is_dbias=is_dbias, quantizer=quantizer, flatten_axis=-2, + act_params=act_params, ) if war_output is not None: return war_output @@ -1191,6 +1303,7 @@ def quantize_dact_dbias( x=x, activation_type=activation_type, quantizer=None, + act_params=act_params, ) out, dbias = _quantize_dbias_impl( out.data, is_dbias=is_dbias, quantizer=quantizer, dq_dtype=x.dtype, flatten_axis=-2 @@ -1203,7 +1316,10 @@ def quantize_dact_dbias( # TE/common dact_dbias_quantize does not support gated act yet if is_dbias and is_gated: dgated = dact_lu( - dz.astype(jnp.float32), x.astype(jnp.float32), activation_type=activation_type + dz.astype(jnp.float32), + x.astype(jnp.float32), + activation_type=activation_type, + act_params=act_params, ) out, dbias = _quantize_dbias_impl( dgated, quantizer, is_dbias=True, dq_dtype=x.dtype, flatten_axis=-2 @@ -1229,6 +1345,7 @@ def quantize_dact_dbias( act_enum=act_type_id, act_len=act_len, is_outer=True, + act_params=act_params, ) # For DelayedScaling transpose, the scale buffer is shared for both rowwise and colwise @@ -1257,6 +1374,7 @@ def dact_lu( x: jnp.ndarray, activation_type: Sequence[Union[str, Callable]], quantizer: Optional[Quantizer] = None, + act_params: Optional[ActivationParams] = None, ) -> Union[jnp.ndarray, ScaledTensor]: """ Backward pass for activation with optional quantization. @@ -1270,11 +1388,13 @@ def dact_lu( Returns: The gradient of the activation with respect to the input. 
""" + act_params = act_params if act_params is not None else ActivationParams() output, _ = quantize_dact_dbias( dz=dz, x=x, activation_type=activation_type, is_dbias=False, quantizer=quantizer, + act_params=act_params, ) return output diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index 59079fe3f0..1a55cc52cd 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -38,6 +38,15 @@ XLA_FFI_REGISTER_ENUM_ATTR_DECODING(transformer_engine::jax::JAXX_Scaling_Mode); namespace transformer_engine { namespace jax { +struct ClampedSwigluConfig { + float limit; + float alpha; +}; + +struct ActivationConfig { + ClampedSwigluConfig clamped_swiglu_config; +}; + inline bool use_fp8(DType type) { return type == DType::kFloat8E4M3 || type == DType::kFloat8E5M2; } // Activation @@ -134,4 +143,11 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(CublasHandleInitHandler); } // namespace jax } // namespace transformer_engine +XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(transformer_engine::jax::ClampedSwigluConfig, + ::xla::ffi::StructMember("limit"), + ::xla::ffi::StructMember("alpha")); + +XLA_FFI_REGISTER_STRUCT_ATTR_DECODING( + transformer_engine::jax::ActivationConfig, + ::xla::ffi::StructMember("clamped_swiglu")); #endif // TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_ diff --git a/transformer_engine/jax/csrc/extensions/activation.cpp b/transformer_engine/jax/csrc/extensions/activation.cpp index 17fa9906bb..7e7e3178b4 100644 --- a/transformer_engine/jax/csrc/extensions/activation.cpp +++ b/transformer_engine/jax/csrc/extensions/activation.cpp @@ -18,7 +18,10 @@ Error_Type ActLuFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type scal Result_Type output_buf, Result_Type colwise_output_buf, Result_Type scale_inv_buf, Result_Type colwise_scale_inv_buf, Result_Type amax_buf, int64_t act_enum, JAXX_Scaling_Mode scaling_mode, - bool is_2x_int) { + bool is_2x_int, ActivationConfig act_params) { + // parameters for clamped swiglu used in GPT OSS + auto swiglu_limit = act_params.clamped_swiglu_config.limit; + auto swiglu_alpha = act_params.clamped_swiglu_config.alpha; auto in_dtype = convert_ffi_datatype_to_te_dtype(input_buf.element_type()); auto out_dtype = convert_ffi_datatype_to_te_dtype(output_buf->element_type()); @@ -125,6 +128,10 @@ Error_Type ActLuFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type scal case NVTE_Activation_Type::SREGLU: nvte_sreglu(input_tensor.data(), output_tensor.data(), stream); break; + case NVTE_Activation_Type::CLAMPED_SWIGLU: + nvte_clamped_swiglu(input_tensor.data(), output_tensor.data(), swiglu_limit, swiglu_alpha, + stream); + break; default: NVTE_ERROR("Unsupported ActivationEnum"); break; @@ -133,20 +140,23 @@ Error_Type ActLuFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type scal return ffi_with_cuda_error_check(); } -XLA_FFI_DEFINE_HANDLER_SYMBOL(ActLuHandler, ActLuFFI, - FFI::Bind() - .Ctx() // stream - .Arg() // input - .Arg() // scale - .Ret() // output - .Ret() // colwise output - .Ret() // scale_inv - .Ret() // scale_inv colwise - .Ret() // amax - .Attr("act_enum") - .Attr("scaling_mode") - .Attr("is_2x"), - FFI_CudaGraph_Traits); +XLA_FFI_DEFINE_HANDLER_SYMBOL( + ActLuHandler, ActLuFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // input + .Arg() // scale + .Ret() // output + .Ret() // colwise output + .Ret() // scale_inv + .Ret() // scale_inv colwise + .Ret() // amax + .Attr("act_enum") + .Attr("scaling_mode") + .Attr("is_2x") + .Attr( + "act_params"), // Can generalize 
the config later if we have more activations that need params + FFI_CudaGraph_Traits); pybind11::tuple GetDActDBiasQuantizeWorkspaceSizes(size_t batch_size, size_t hidden_size, DType in_dtype, DType out_dtype, @@ -216,7 +226,11 @@ Error_Type DActLuDBiasQuantizeFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type scale_inv_buf, Result_Type colwise_scale_inv_buf, Result_Type amax_buf, Result_Type dbias_buf, Result_Type workspace_buf, JAXX_Scaling_Mode scaling_mode, - int64_t act_enum, bool is_2x, bool is_dbias) { + int64_t act_enum, bool is_2x, bool is_dbias, + ActivationConfig act_params) { + // parameters for clamped swiglu used in GPT OSS + auto swiglu_limit = act_params.clamped_swiglu_config.limit; + auto swiglu_alpha = act_params.clamped_swiglu_config.alpha; auto in_dtype = convert_ffi_datatype_to_te_dtype(input_buf.element_type()); auto out_dtype = convert_ffi_datatype_to_te_dtype(output_buf->element_type()); auto workspace_dtype = convert_ffi_datatype_to_te_dtype(workspace_buf->element_type()); @@ -383,6 +397,10 @@ Error_Type DActLuDBiasQuantizeFFI(cudaStream_t stream, Buffer_Type input_buf, case NVTE_Activation_Type::SREGLU: nvte_dsreglu(input_tensor.data(), act_input_tensor.data(), output_tensor.data(), stream); break; + case NVTE_Activation_Type::CLAMPED_SWIGLU: + nvte_clamped_dswiglu(input_tensor.data(), act_input_tensor.data(), output_tensor.data(), + swiglu_limit, swiglu_alpha, stream); + break; default: NVTE_ERROR("Unsupported ActivationEnum"); break; @@ -408,7 +426,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(DActLuDBiasQuantizeHandler, DActLuDBiasQuantizeFFI .Attr("scaling_mode") .Attr("act_enum") .Attr("is_2x") - .Attr("is_dbias"), + .Attr("is_dbias") + .Attr("act_params"), FFI_CudaGraph_Traits); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index afbeb644c1..08600fd3f4 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -133,6 +133,7 @@ PYBIND11_MODULE(transformer_engine_jax, m) { .value("QGEGLU", NVTE_Activation_Type::QGEGLU) .value("SRELU", NVTE_Activation_Type::SRELU) .value("SREGLU", NVTE_Activation_Type::SREGLU) + .value("CLAMPED_SWIGLU", NVTE_Activation_Type::CLAMPED_SWIGLU) .export_values(); pybind11::enum_(m, "NVTE_Fused_Attn_Backend", pybind11::module_local()) diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index c548c54efa..f02876d8f4 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -898,6 +898,10 @@ class LayerNormMLP(TransformerEngineBase): activations: Sequence[Union[str, Callable]], default = ('relu',) The sequence of activation functions to apply after the first dense layer transformation. Each activation has its own transformation layer. + activation_params: dict, default = None + The parameters needed (if any) by the activation functions specified in :attr:`activations`. + At the moment only ('clamped_silu', 'clamped_linear'), i.e. the clamped SwiGLU used in GPT OSS, + requires additional parameters. intermediate_dropout_rng_name: str, default = 'dropout' The key in given RNGs via flax.linen.Module.apply that for generating Dropout masks. intermediate_dropout_rate: float, default = 0.1 @@ -956,6 +960,7 @@ class LayerNormMLP(TransformerEngineBase): bias_axes_2: Tuple[str, ...]
= ("embed",) return_layernorm_output: bool = True activations: Sequence[Union[str, Callable]] = ("relu",) + activation_params: dict = None intermediate_dropout_rng_name: str = "dropout" intermediate_dropout_rate: float = 0.1 intermediate_hidden_dropout_dims: Sequence[int] = () @@ -1023,6 +1028,7 @@ def __call__(self, inputs: Array, deterministic: bool = False) -> Array: ("relu", "linear"), ("quick_gelu", "linear"), ("squared_relu", "linear"), + ("clamped_silu", "clamped_linear"), ] act_pool = [("gelu",), ("silu",), ("relu",), ("quick_gelu",), ("squared_relu",)] normalized_acts = [] @@ -1031,7 +1037,9 @@ def __call__(self, inputs: Array, deterministic: bool = False) -> Array: return False normalized_acts.append(act.lower()) normalized_acts = tuple( - reversed(normalized_acts) if normalized_acts[0] == "linear" else normalized_acts + reversed(normalized_acts) + if (normalized_acts[0] == "linear" or normalized_acts[0] == "clamped_linear") + else normalized_acts ) is_act_implemented = normalized_acts in (gated_act_pool + act_pool) @@ -1150,6 +1158,7 @@ def kernel_1_init(key, num_kernels, stack_axis, *init_args): ffn1_ckpt_name=self.ffn1_ckpt_name, ffn2_ckpt_name=self.ffn2_ckpt_name, activation_type=normalized_acts, + activation_params=self.activation_params, quantizer_sets=(ffn1_quantizer_set, ffn2_quantizer_set), ) out = out.reshape(*inputs.shape[: self.axis], *hidden_size_tuple) @@ -1287,4 +1296,4 @@ def kernel_1_init(key, num_kernels, stack_axis, *init_args): out = checkpoint_name(out, self.ffn2_ckpt_name) assert out.dtype == input_dtype - return out, ln_output # Output, layner_norm_output + return out, ln_output # Output, layer_norm_output diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py index fb3ac7b9ae..fc72b9bc3f 100644 --- a/transformer_engine/jax/flax/transformer.py +++ b/transformer_engine/jax/flax/transformer.py @@ -1631,6 +1631,9 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods mlp_activations: Sequence[str], default = ('relu', ) The sequence of activation functions to apply after the first linear transformation. Each activation has its own transformation layer. + mlp_activation_params: dict = None + This is only used when ('clamped_silu', 'clamped_linear') is in :attr:`mlp_activations`. At the moment + ClampedSwiglu is the only activation that requires parameters. use_bias: bool, default = False Indicate whether to enable bias shifting for QKVO projections, FC1 and FC2. If set to False, the layer will not learn additive biases. 
@@ -1751,6 +1754,7 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods mha_kernel_init: Initializer = None mlp_kernel_init: Initializer = None mlp_activations: Sequence[str] = ("relu",) + mlp_activation_params: dict = None use_bias: bool = False bias_init: Initializer = nn.initializers.zeros apply_residual_connection_post_layernorm: bool = False @@ -2045,6 +2049,7 @@ def hidden_dropout(x, deterministic): return_layernorm_output=self.apply_residual_connection_post_layernorm, intermediate_dim=self.mlp_hidden_size, activations=self.mlp_activations, + activation_params=self.mlp_activation_params, intermediate_dropout_rng_name=self.dropout_rng_name, intermediate_dropout_rate=self.intermediate_dropout, intermediate_hidden_dropout_dims=self.intermediate_dropout_dims, diff --git a/transformer_engine/jax/layernorm_mlp.py b/transformer_engine/jax/layernorm_mlp.py index e3eaa53e1d..20541e719b 100644 --- a/transformer_engine/jax/layernorm_mlp.py +++ b/transformer_engine/jax/layernorm_mlp.py @@ -49,6 +49,7 @@ def layernorm_mlp( ffn1_ckpt_name: str = "ffn1", ffn2_ckpt_name: str = "ffn2", activation_type: Sequence[Union[str, Callable]] = ("gelu",), + activation_params: dict = None, quantizer_sets: Tuple[QuantizerSet] = (noop_quantizer_set, noop_quantizer_set), ) -> jnp.ndarray: """Apply layer normalization followed by MLP block. @@ -130,12 +131,13 @@ def layernorm_mlp( ffn1_ckpt_name, ffn2_ckpt_name, activation_type, + activation_params, quantizer_sets, ) return output -@partial(jax.custom_vjp, nondiff_argnums=(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)) +@partial(jax.custom_vjp, nondiff_argnums=(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)) def _layernorm_mlp( x: jnp.ndarray, gamma: jnp.ndarray, @@ -155,6 +157,7 @@ def _layernorm_mlp( ffn1_ckpt_name: str, ffn2_ckpt_name: str, activation_type: Sequence[Union[str, Callable]], + activation_params: dict, quantizer_sets, ): """Internal implementation of layernorm_mlp with custom VJP. @@ -204,6 +207,7 @@ def _layernorm_mlp( ffn1_ckpt_name, ffn2_ckpt_name, activation_type, + activation_params, quantizer_sets, ) return output @@ -228,6 +232,7 @@ def _layernorm_mlp_fwd_rule( ffn1_ckpt_name, ffn2_ckpt_name, activation_type, + activation_params, quantizer_sets, ): """Forward pass rule for layernorm_mlp. 
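For concreteness, a short sketch of the ActivationParams container that the forward/backward rules below build from the activation_params dict and thread through to tex.act_lu and quantize_dact_dbias (a sketch under assumptions: the cpp_extensions alias and the example shapes are not from the patch):

import jax.numpy as jnp
from transformer_engine.jax import cpp_extensions as tex  # alias assumed

activation_type = ("clamped_silu", "clamped_linear")
act_params = tex.activation.ActivationParams.create(
    activation_type=activation_type, limit=7.0, alpha=1.702
)
# Gated activations expect the input replicated along axis -2 (one slice per activation).
x = jnp.ones((16, 2, 128), dtype=jnp.bfloat16)
y = tex.act_lu(x, activation_type, quantizer=None, act_params=act_params)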
@@ -306,10 +311,18 @@ def _layernorm_mlp_fwd_rule( dot_1_output = checkpoint_name(dot_1_output, ffn1_ckpt_name) # (batch..., hidden_in) -> (batch..., hidden) + # At the moment the act_params is only used for ClampedSwiglu + # If there are more activations that require parameters in the future + # we might need to change it to a more generic parameter container casted_act_out = tex.act_lu( dot_1_output, activation_type, quantizer=ffn2_quantizer_set.x, + act_params=( + tex.activation.ActivationParams.create(activation_type, **activation_params) + if activation_params + else None + ), ) casted_act_out = with_sharding_constraint_by_logical_axes(casted_act_out, dot_2_input_axes) @@ -372,6 +385,7 @@ def _layernorm_mlp_bwd_rule( ffn1_ckpt_name, ffn2_ckpt_name, activation_type, + activation_params, ctx, grad, ): @@ -459,6 +473,11 @@ def _layernorm_mlp_bwd_rule( activation_type=activation_type, is_dbias=use_bias_1, quantizer=ffn2_quantizer_set.dgrad, + act_params=( + tex.activation.ActivationParams.create(activation_type, **activation_params) + if activation_params + else None + ), ) # k_non_contracting_dims calibrated with the shape difference of grad.ndim vs kernel_1.ndim diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 4edc6d81e1..7751e86e23 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -200,6 +200,10 @@ py::object swiglu(const at::Tensor &input, py::handle quantizer); py::object dswiglu(const at::Tensor &grad, const at::Tensor &input, py::handle quantizer); +py::object clamped_swiglu(const at::Tensor &input, py::handle quantizer, float limit, float alpha); + +py::object clamped_dswiglu(const at::Tensor &grad, const at::Tensor &input, py::handle quantizer, + float limit, float alpha); /*************************************************************************************************** * LayerNorm **************************************************************************************************/ diff --git a/transformer_engine/pytorch/csrc/extensions/activation.cpp b/transformer_engine/pytorch/csrc/extensions/activation.cpp index 7851cc5ffc..856a597c67 100644 --- a/transformer_engine/pytorch/csrc/extensions/activation.cpp +++ b/transformer_engine/pytorch/csrc/extensions/activation.cpp @@ -3,15 +3,19 @@ * * See LICENSE for license information. ************************************************************************/ - #include "../extensions.h" #include "common.h" #include "pybind.h" namespace transformer_engine::pytorch { -template -py::object activation_helper(const at::Tensor& input, py::handle quantizer, int shape_divisor = 1) { +/* Type aliases for readability */ +using FuncType = void(const NVTETensor, NVTETensor, cudaStream_t); +using DFuncType = void(const NVTETensor, const NVTETensor, NVTETensor, cudaStream_t); + +template +py::object activation_helper(const at::Tensor& input, py::handle quantizer, int shape_divisor = 1, + Args&&... 
args) { init_extension(); // Input tensor @@ -30,31 +34,47 @@ py::object activation_helper(const at::Tensor& input, py::handle quantizer, int if (quantizer.is_none() || detail::IsFloat8Quantizers(quantizer.ptr()) || detail::IsMXFP8Quantizers(quantizer.ptr())) { // Compute activation directly - NVTE_SCOPED_GIL_RELEASE( - { act_func(input_cpp.data(), out_cpp.data(), at::cuda::getCurrentCUDAStream()); }); + NVTE_SCOPED_GIL_RELEASE({ + if constexpr (act_func == nullptr) { + act_func_with_args(input_cpp.data(), out_cpp.data(), std::forward(args)..., + at::cuda::getCurrentCUDAStream()); + } else { + act_func(input_cpp.data(), out_cpp.data(), at::cuda::getCurrentCUDAStream()); + } + }); } else if (detail::IsFloat8CurrentScalingQuantizers(quantizer.ptr())) { // Compute activation in high-precision fused together with amax, then quantize. - auto quantizer_cpp_cs = dynamic_cast(quantizer_cpp.get()); auto [temp_cpp, _] = quantizer_cpp_cs->create_hp_tensor_with_amax(output_shape, fake_dtype); - NVTE_SCOPED_GIL_RELEASE( - { act_func(input_cpp.data(), temp_cpp.data(), at::cuda::getCurrentCUDAStream()); }); + NVTE_SCOPED_GIL_RELEASE({ + if constexpr (act_func == nullptr) { + act_func_with_args(input_cpp.data(), temp_cpp.data(), std::forward(args)..., + at::cuda::getCurrentCUDAStream()); + } else { + act_func(input_cpp.data(), temp_cpp.data(), at::cuda::getCurrentCUDAStream()); + } + }); quantizer_cpp_cs->quantize_with_amax(temp_cpp, out_cpp); } else { // Compute activation in high-precision, then quantize - auto [temp_cpp, _] = NoneQuantizer(py::none()).create_tensor(output_shape, fake_dtype); - NVTE_SCOPED_GIL_RELEASE( - { act_func(input_cpp.data(), temp_cpp.data(), at::cuda::getCurrentCUDAStream()); }); + NVTE_SCOPED_GIL_RELEASE({ + if constexpr (act_func == nullptr) { + act_func_with_args(input_cpp.data(), temp_cpp.data(), std::forward(args)..., + at::cuda::getCurrentCUDAStream()); + } else { + act_func(input_cpp.data(), temp_cpp.data(), at::cuda::getCurrentCUDAStream()); + } + }); quantizer_cpp->quantize(temp_cpp, out_cpp); } return out_py; } -template +template py::object dactivation_helper(const at::Tensor& grad_output, const at::Tensor& input, - py::handle quantizer) { + py::handle quantizer, Args&&... args) { init_extension(); // Grad output and input tensors @@ -76,24 +96,39 @@ py::object dactivation_helper(const at::Tensor& grad_output, const at::Tensor& i detail::IsMXFP8Quantizers(quantizer.ptr())) { // Compute activation backward directly NVTE_SCOPED_GIL_RELEASE({ - dact_func(grad_output_cpp.data(), input_cpp.data(), grad_input_cpp.data(), - at::cuda::getCurrentCUDAStream()); + if constexpr (dact_func == nullptr) { + dact_func_with_args(grad_output_cpp.data(), input_cpp.data(), grad_input_cpp.data(), + std::forward(args)..., at::cuda::getCurrentCUDAStream()); + } else { + dact_func(grad_output_cpp.data(), input_cpp.data(), grad_input_cpp.data(), + at::cuda::getCurrentCUDAStream()); + } }); } else if (detail::IsFloat8CurrentScalingQuantizers(quantizer.ptr())) { // Compute activation backward in high-precision fused together with amax, then quantize. 
auto quantizer_cpp_cs = dynamic_cast(quantizer_cpp.get()); auto [temp_cpp, _] = quantizer_cpp_cs->create_hp_tensor_with_amax(input_shape, fake_dtype); NVTE_SCOPED_GIL_RELEASE({ - dact_func(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), - at::cuda::getCurrentCUDAStream()); + if constexpr (dact_func == nullptr) { + dact_func_with_args(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), + std::forward(args)..., at::cuda::getCurrentCUDAStream()); + } else { + dact_func(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), + at::cuda::getCurrentCUDAStream()); + } }); quantizer_cpp_cs->quantize_with_amax(temp_cpp, grad_input_cpp); } else { // Compute activation backward in high-precision, then quantize auto [temp_cpp, _] = NoneQuantizer(py::none()).create_tensor(input_shape, fake_dtype); NVTE_SCOPED_GIL_RELEASE({ - dact_func(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), - at::cuda::getCurrentCUDAStream()); + if constexpr (dact_func == nullptr) { + dact_func_with_args(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), + std::forward(args)..., at::cuda::getCurrentCUDAStream()); + } else { + dact_func(grad_output_cpp.data(), input_cpp.data(), temp_cpp.data(), + at::cuda::getCurrentCUDAStream()); + } }); quantizer_cpp->quantize(temp_cpp, grad_input_cpp); } @@ -101,86 +136,96 @@ py::object dactivation_helper(const at::Tensor& grad_output, const at::Tensor& i return grad_input_py; } -/* GELU and variants*/ +/* GELU and variants */ py::object gelu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer); + return activation_helper(input, quantizer); } py::object dgelu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object geglu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer, 2); + return activation_helper(input, quantizer, 2); } py::object dgeglu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object qgelu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer); + return activation_helper(input, quantizer); } py::object dqgelu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object qgeglu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer, 2); + return activation_helper(input, quantizer, 2); } py::object dqgeglu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } /* ReLU and variants*/ py::object relu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer); + return activation_helper(input, quantizer); } py::object drelu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object reglu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer, 2); + return activation_helper(input, quantizer, 2); } py::object dreglu(const at::Tensor& grad, const at::Tensor& input, 
py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object srelu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer); + return activation_helper(input, quantizer); } py::object dsrelu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object sreglu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer, 2); + return activation_helper(input, quantizer, 2); } py::object dsreglu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } - -/* Silu and variants*/ +/* Silu and variants */ py::object silu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer); + return activation_helper(input, quantizer); } py::object dsilu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); } py::object swiglu(const at::Tensor& input, py::handle quantizer) { - return activation_helper(input, quantizer, 2); + return activation_helper(input, quantizer, 2); } py::object dswiglu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer) { - return dactivation_helper(grad, input, quantizer); + return dactivation_helper(grad, input, quantizer); +} + +/* clamped functions */ +py::object clamped_swiglu(const at::Tensor& input, py::handle quantizer, float limit, float alpha) { + return activation_helper(input, quantizer, 2, limit, alpha); +} + +py::object clamped_dswiglu(const at::Tensor& grad, const at::Tensor& input, py::handle quantizer, + float limit, float alpha) { + return dactivation_helper(grad, input, quantizer, limit, alpha); } + } // namespace transformer_engine::pytorch diff --git a/transformer_engine/pytorch/csrc/extensions/pybind.cpp b/transformer_engine/pytorch/csrc/extensions/pybind.cpp index 7649ccb6d6..ae6575914c 100644 --- a/transformer_engine/pytorch/csrc/extensions/pybind.cpp +++ b/transformer_engine/pytorch/csrc/extensions/pybind.cpp @@ -136,6 +136,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("quantizer")); m.def("swiglu", transformer_engine::pytorch::swiglu, "SwiGLU activation", py::arg("input"), py::arg("quantizer")); + m.def("clamped_swiglu", transformer_engine::pytorch::clamped_swiglu, + "SwiGLU activation used in GPT OSS", py::arg("input"), py::arg("quantizer"), + py::arg("limit") = 7.0f, py::arg("alpha") = 1.702f); /* Backward of GELU and variants */ m.def("dgelu", transformer_engine::pytorch::dgelu, "Backward of GeLU", py::arg("grad"), py::arg("fwd_input"), py::arg("quantizer")); @@ -159,6 +162,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("fwd_input"), py::arg("quantizer")); m.def("dswiglu", transformer_engine::pytorch::dswiglu, "Backward of SwiGLU", py::arg("grad"), py::arg("fwd_input"), py::arg("quantizer")); + m.def("clamped_dswiglu", transformer_engine::pytorch::clamped_dswiglu, + "Backward of SwiGLU used in GPT OSS", py::arg("grad"), py::arg("fwd_input"), + py::arg("quantizer"), py::arg("limit") = 7.0f, py::arg("alpha") = 1.702f); /* DBias + DAct fusions*/ m.def("dbias_dgelu", transformer_engine::pytorch::dbias_dgelu, "DGeLU + DBias + Quantize", py::arg("grad"), 
py::arg("fwd_input"), py::arg("quantizer")); diff --git a/transformer_engine/pytorch/ops/basic/__init__.py b/transformer_engine/pytorch/ops/basic/__init__.py index 2c903675fb..28d49bf7b9 100644 --- a/transformer_engine/pytorch/ops/basic/__init__.py +++ b/transformer_engine/pytorch/ops/basic/__init__.py @@ -4,7 +4,19 @@ """Single tensor operations supported by the operation fuser.""" -from .activation import GELU, GEGLU, QGELU, QGEGLU, ReLU, ReGLU, SReLU, SReGLU, SiLU, SwiGLU +from .activation import ( + GELU, + GEGLU, + QGELU, + QGEGLU, + ReLU, + ReGLU, + SReLU, + SReGLU, + SiLU, + SwiGLU, + ClampedSwiGLU, +) from .add_extra_input import AddExtraInput from .all_gather import AllGather from .all_reduce import AllReduce diff --git a/transformer_engine/pytorch/ops/basic/activation.py b/transformer_engine/pytorch/ops/basic/activation.py index 22779b6017..8a754c6382 100644 --- a/transformer_engine/pytorch/ops/basic/activation.py +++ b/transformer_engine/pytorch/ops/basic/activation.py @@ -28,6 +28,7 @@ "SReGLU", "SiLU", "SwiGLU", + "ClampedSwiGLU", ] @@ -392,3 +393,38 @@ def _activation_forward_impl(self, *args, **kwargs) -> torch.Tensor: def _activation_backward_impl(self, *args, **kwargs) -> torch.Tensor: return tex.dswiglu(*args, **kwargs) + + +class ClampedSwiGLU(_ActivationOperation): + r"""GPT-OSS + Implementation based on `GPT-OSS`__. + + This activation has two differences compared to the original SwiGLU + 1. Both gate and pre-activations are clipped based on parameter limit. + 2. Activation uses sigmoid(alpha * x) instead of sigmoid(x) used in Swish activation. + + .. warning:: The input tensor is chunked along the last dimension to get gates/pre-activations which is differnt + from GPT OSS implementation where the gates/pre-activations are assumed to be interleaved in the input tensor. + + Parameters + ---------- + limit: float + The clamp limit. + alpha: float + The scaling factor for the sigmoid function used in the activation. + cache_quantized_input: bool, default = False + Quantize input tensor when caching for use in the backward pass. + """ + + def __init__( + self, *, limit: float = 7.0, alpha: float = 1.702, cache_quantized_input: bool = False + ): + super().__init__(cache_quantized_input=cache_quantized_input) + self.limit = limit + self.alpha = alpha + + def _activation_forward_impl(self, *args, **kwargs) -> torch.Tensor: + return tex.clamped_swiglu(*args, limit=self.limit, alpha=self.alpha, **kwargs) + + def _activation_backward_impl(self, *args, **kwargs) -> torch.Tensor: + return tex.clamped_dswiglu(*args, limit=self.limit, alpha=self.alpha, **kwargs)