From b91a8ef93a77f6acd66788a2791e3dbe10547096 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Tue, 26 May 2026 16:13:50 -0700
Subject: [PATCH 1/2] Cortex-M backend: add quantized_activation op with LUT
 lowering for sigmoid/tanh/silu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CMSIS-NN has no s8 activation primitive — the s16 path requantizes around an
on-target polynomial, which costs an extra s8 → s16 → activation → s8 trip
per call. Instead this lowers standalone aten.sigmoid / aten.tanh / aten.silu
to a single cortex_m.quantized_activation(input, lut) op backed by a 256-entry
int8 LUT precomputed at AoT from the input/output qparams and the activation
function. The kernel is a single byte-indexed lookup loop -- shape-agnostic,
activation-agnostic, and free of any runtime requantization. Encoding the
activation in the LUT bytes rather than a kind enum keeps the kernel surface
to one op.

For SiLU specifically, the LUT can encode `x * sigmoid(x)` directly, so the
naive sigmoid-plus-elementwise-mul decomposition is unnecessary. aten.silu
is added to the to_edge preserve_ops list so it doesn't decompose to sigmoid+mul
before the lowering pass sees it; this is set globally because no per-test
opt-out exists today.

LUT-build numerics deliberately mirror the existing cortex_m CMSIS-NN
conventions. Sigmoid/silu use a sign-branched stable form that always
exponentiates a non-positive value, so the LUT build can't trip OverflowError
for unusually wide input qparams. The final fp → int8 quantize uses
round-half-away-from-zero, matching the rounding requantize_cmsis applies
after its right-shift in passes_utils.

In Silero VAD the final `sigmoid(final_conv(x))` now lowers; the 3 remaining
sigmoids and 2 tanhs are LSTMCell gates and stay in aten because PyTorch
export captures nn.LSTMCell as a single high-level op -- the quantizer never
sees the gates and can't annotate them, and to_edge only decomposes the cell
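after the quantizer has run. test_lstm_cell.py captures the expected
end-state as a strict xfail: once a pre-annotation decompose pass lands, the
test will start passing, which strict=True reports as a failure (XPASS),
signalling that the xfail marker should be removed. That work is tracked as
a separate follow-up.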

Other activations (GELU for KWT, Mish, ELU, Softplus) plug in as a few
additional entries in passes_utils._ACTIVATION_FNS plus matching quantizer
patterns. The generic op + LUT design carries them with no kernel changes.

Co-authored-by: Claude <noreply@anthropic.com>
---
 backends/cortex_m/CMakeLists.txt              |   1 +
 .../cortex_m/ops/op_quantized_activation.cpp  |  53 ++++++
 backends/cortex_m/ops/operators.py            |  29 ++++
 backends/cortex_m/ops/operators.yaml          |   6 +
 .../passes/convert_to_cortex_m_pass.py        |  50 +++++-
 backends/cortex_m/passes/passes_utils.py      |  58 +++++++
 .../cortex_m/quantizer/pattern_checkers.py    |  19 +++
 .../cortex_m/quantizer/quantizer_support.py   |   8 +
 .../cortex_m/test/models/test_lstm_cell.py    |  58 +++++++
 .../cortex_m/test/models/test_silero_vad.py   |  20 ++-
 .../test/ops/test_activation_quant.py         | 152 ++++++++++++++++++
 backends/cortex_m/test/tester.py              |   8 +
 12 files changed, 456 insertions(+), 6 deletions(-)
 create mode 100644 backends/cortex_m/ops/op_quantized_activation.cpp
 create mode 100644 backends/cortex_m/test/models/test_lstm_cell.py
 create mode 100644 backends/cortex_m/test/ops/test_activation_quant.py

diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index 627406c1935..f88a6306fed 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -81,6 +81,7 @@ set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_activation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_avg_pool2d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_batch_matmul.cpp
diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp
new file mode 100644
index 00000000000..454dc76bc8e
--- /dev/null
+++ b/backends/cortex_m/ops/op_quantized_activation.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+namespace cortex_m {
+namespace native {
+
+// cppcheck-suppress unusedFunction
+Tensor& quantized_activation_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Tensor& lut,
+    Tensor& out) {
+  ET_CHECK_MSG(
+      input.scalar_type() == ScalarType::Char,
+      "quantized_activation: input must be int8");
+  ET_CHECK_MSG(
+      out.scalar_type() == ScalarType::Char,
+      "quantized_activation: output must be int8");
+  ET_CHECK_MSG(
+      lut.scalar_type() == ScalarType::Char,
+      "quantized_activation: lut must be int8");
+  ET_CHECK_MSG(
+      lut.numel() == 256,
+      "quantized_activation: lut must have 256 entries, got %" PRId64,
+      static_cast<int64_t>(lut.numel()));
+  ET_CHECK_MSG(
+      input.numel() == out.numel(),
+      "quantized_activation: input and output must have the same numel");
+
+  const int8_t* in_data = input.const_data_ptr<int8_t>();
+  const int8_t* lut_data = lut.const_data_ptr<int8_t>();
+  int8_t* out_data = out.mutable_data_ptr<int8_t>();
+
+  // Bias the signed int8 input by 128 to use it as an unsigned table index;
+  // the LUT entries are precomputed AoT from the input/output qparams and the
+  // activation function (sigmoid / tanh / silu / ...), so the kernel does not
+  // need to know which activation it is implementing.
+  const int64_t n = input.numel();
+  for (int64_t i = 0; i < n; ++i) {
+    out_data[i] = lut_data[static_cast<uint8_t>(in_data[i] + 128)];
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index d4393bc7ada..4c6fb44e89d 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -264,6 +264,35 @@ def quantized_mul_impl(
     return result
 
 
+# ===================================================================
+# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION
+# ===================================================================
+# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT
+# from the input/output qparams and the activation function (sigmoid, tanh,
+# silu, ...), so the kernel is identical regardless of which activation it
+# evaluates: out[i] = lut[input[i] + 128].
+lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor")
+lib.define(
+    "quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)"
+)
+
+
+@register_fake("cortex_m::quantized_activation")  # type: ignore[misc]
+def quantized_activation_meta(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    assert input.dtype == torch.int8, "quantized_activation input must be int8"
+    assert lut.dtype == torch.int8 and lut.numel() == 256, (
+        "quantized_activation lut must be int8 with 256 entries; "
+        f"got dtype={lut.dtype}, numel={lut.numel()}"
+    )
+    return torch.empty_like(input)
+
+
+@impl(lib, "quantized_activation", "CompositeExplicitAutograd")  # type: ignore[misc]
+def quantized_activation_impl(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    indices = input.to(torch.int32) + 128
+    return lut[indices].to(torch.int8)
+
+
 # ===================================================================
 # QUANTIZED BATCH MATMUL OPERATION DEFINITION
 # ===================================================================
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index 8db109dea43..8eacf2f49b9 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -29,6 +29,12 @@
     - arg_meta: null
       kernel_name: cortex_m::quantized_mul_out
 
+- func: cortex_m::quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::quantized_activation_out
+
 - func: cortex_m::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index 5704645caf8..81cb5498217 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -13,7 +13,10 @@
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 
 from executorch.backends.cortex_m.passes.cortex_m_pass import CortexMPass
-from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+from executorch.backends.cortex_m.passes.passes_utils import (
+    build_activation_lut,
+    quantize_multiplier_aot,
+)
 from executorch.backends.cortex_m.passes.scratch_buffer_sizes import (
     required_cmsis_nn_buffer_sizes,
 )
@@ -483,6 +486,45 @@ def _get_bmm_replacement(self, node):
         )
         return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args
 
+    _ACTIVATION_KINDS = {
+        exir_ops.edge.aten.sigmoid.default: "sigmoid",
+        exir_ops.edge.aten.tanh.default: "tanh",
+        exir_ops.edge.aten.silu.default: "silu",
+    }
+
+    def _get_activation_replacement(self, node):
+        """Lower a standalone quantized sigmoid / tanh / silu to a single
+        cortex_m.quantized_activation call backed by an AoT-built 256-entry
+        int8 LUT. The kernel is shape-agnostic; the LUT encodes both the
+        activation function and the input/output qparams.
+        """
+        input_qparams = node.meta["input_qparams"][0]
+        output_qparams = node.meta["output_qparams"][0]
+        kind = self._ACTIVATION_KINDS[node.target]
+        lut_tensor = build_activation_lut(
+            kind,
+            float(input_qparams.scale),
+            int(input_qparams.zp),
+            float(output_qparams.scale),
+            int(output_qparams.zp),
+        )
+
+        # Constant placeholders must appear before user-input placeholders;
+        # anchor on the first existing placeholder so the new LUT lands in the
+        # constant-placeholder block at the top of the graph.
+        first_placeholder = next(n for n in node.graph.nodes if n.op == "placeholder")
+        with node.graph.inserting_before(first_placeholder):
+            lut_node = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_lut",
+                InputKind.PARAMETER,
+                lut_tensor,
+            )
+
+        new_args = (node.args[0], lut_node)
+        return exir_ops.edge.cortex_m.quantized_activation.default, new_args
+
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         modified = False
         for node in graph_module.graph.nodes:
@@ -506,6 +548,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                         op, args = self._get_convolution_replacement(node)
                 case exir_ops.edge.aten.bmm.default:
                     op, args = self._get_bmm_replacement(node)
+                case (
+                    exir_ops.edge.aten.sigmoid.default
+                    | exir_ops.edge.aten.tanh.default
+                    | exir_ops.edge.aten.silu.default
+                ):
+                    op, args = self._get_activation_replacement(node)
                 case _:
                     continue
 
diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py
index fcbfa301b06..64169e07521 100644
--- a/backends/cortex_m/passes/passes_utils.py
+++ b/backends/cortex_m/passes/passes_utils.py
@@ -190,6 +190,64 @@ def is_qualified_int8_node(args) -> bool:
         return False
 
 
+def _stable_sigmoid(x: float) -> float:
+    # Always exponentiate the non-positive value so `math.exp` never overflows
+    # for unusually large `|x|` (e.g. wide-range input qparams). Algebraically
+    # identical to `1 / (1 + exp(-x))`.
+    if x >= 0:
+        return 1.0 / (1.0 + math.exp(-x))
+    e = math.exp(x)
+    return e / (1.0 + e)
+
+
+def _stable_silu(x: float) -> float:
+    return x * _stable_sigmoid(x)
+
+
+_ACTIVATION_FNS = {
+    "sigmoid": _stable_sigmoid,
+    "tanh": math.tanh,
+    "silu": _stable_silu,
+}
+
+
+def _round_half_away_from_zero(x: float) -> int:
+    # Matches the rounding convention `requantize_cmsis` (above) applies after
+    # the right-shift step: ties on positive values round toward +∞, ties on
+    # negative values round toward -∞. Python's built-in `round` would use
+    # banker's rounding instead and disagree at exact half-integers.
+    return int(math.copysign(math.floor(abs(x) + 0.5), x)) if x != 0 else 0
+
+
+def build_activation_lut(
+    kind: str,
+    input_scale: float,
+    input_zp: int,
+    output_scale: float,
+    output_zp: int,
+) -> torch.Tensor:
+    """AoT-compute a 256-entry int8 lookup table for a quantized activation.
+
+    The LUT is indexed by the input byte value biased by 128: for any int8
+    input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output.
+    Because the LUT is computed in float and quantized once per entry, the
+    runtime kernel is a single memory-lookup with no requantization math.
+    """
+    if kind not in _ACTIVATION_FNS:
+        raise ValueError(
+            f"build_activation_lut: unknown activation '{kind}' "
+            f"(supported: {sorted(_ACTIVATION_FNS)})"
+        )
+    f = _ACTIVATION_FNS[kind]
+    lut = torch.empty(256, dtype=torch.int8)
+    for q in range(-128, 128):
+        x = (q - input_zp) * input_scale
+        y = f(x)
+        q_out = _round_half_away_from_zero(y / output_scale + output_zp)
+        lut[q + 128] = max(-128, min(127, q_out))
+    return lut
+
+
 def quantize_multiplier_aot(scale: float) -> tuple[int, int]:
     if scale == 0.0:
         return 0, 0
diff --git a/backends/cortex_m/quantizer/pattern_checkers.py b/backends/cortex_m/quantizer/pattern_checkers.py
index 860d8345607..5715ca042de 100644
--- a/backends/cortex_m/quantizer/pattern_checkers.py
+++ b/backends/cortex_m/quantizer/pattern_checkers.py
@@ -99,6 +99,25 @@ def check_quantization_config(
         return is_int8
 
 
+class CortexMActivationCheck(PatternCheck):
+    """Accept standalone elementwise activations (sigmoid / tanh / silu)
+    that the LUT-based cortex_m.quantized_activation op handles uniformly.
+
+    The kernel is shape-agnostic and the LUT is computed AoT from per-tensor
+    qparams, so the only thing to enforce is int8 per-tensor quantization.
+    """
+
+    @classmethod
+    def check_quantization_config(
+        cls, pattern: list[Node], quantization_config: QuantizationConfig
+    ) -> bool:
+        is_int8 = cls.is_int8_activations(quantization_config)
+        is_per_tensor = cls.is_per_tensor(
+            quantization_config.get_input_act_qspec()
+        ) and cls.is_per_tensor(quantization_config.get_output_act_qspec())
+        return is_int8 and is_per_tensor
+
+
 class CortexMSoftmaxCheck(PatternCheck):
 
     @classmethod
diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py
index 3dfbb67638a..317189a5f3e 100644
--- a/backends/cortex_m/quantizer/quantizer_support.py
+++ b/backends/cortex_m/quantizer/quantizer_support.py
@@ -5,6 +5,7 @@
 
 import torch
 from executorch.backends.cortex_m.quantizer.pattern_checkers import (
+    CortexMActivationCheck,
     CortexMAddMulCheck,
     CortexMAvgPool2DCheck,
     CortexMBmmCheck,
@@ -119,6 +120,12 @@
     (torch.ops.aten.softmax.int,): CortexMSoftmaxCheck,
 }
 
+ACTIVATION_OP_PATTERNS = {
+    (torch.ops.aten.sigmoid.default,): CortexMActivationCheck,
+    (torch.ops.aten.tanh.default,): CortexMActivationCheck,
+    (torch.ops.aten.silu.default,): CortexMActivationCheck,
+}
+
 POOL_OP_PATTERNS = {
     (torch.ops.aten.avg_pool2d.default,): CortexMAvgPool2DCheck,
     (torch.ops.aten.max_pool2d.default,): CortexMMaxPool2DCheck,
@@ -161,4 +168,5 @@
     | CONV_TRANSPOSE_OP_PATTERNS
     | POOL_OP_PATTERNS
     | BMM_OP_PATTERNS
+    | ACTIVATION_OP_PATTERNS
 )
diff --git a/backends/cortex_m/test/models/test_lstm_cell.py b/backends/cortex_m/test/models/test_lstm_cell.py
new file mode 100644
index 00000000000..c79574f955b
--- /dev/null
+++ b/backends/cortex_m/test/models/test_lstm_cell.py
@@ -0,0 +1,58 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Regression guard for the LSTMCell-gate lowering unblocker.
+
+`nn.LSTMCell` is exported as a single high-level op, so the quantizer never
+sees the gate sigmoids/tanhs and they end up unannotated in the edge graph.
+The lowering pass correctly skips them, but the user-facing effect is that
+Silero VAD's LSTM gates stay in aten even after the quantized_activation
+op lands.
+
+Unblocking this requires a pre-annotation decompose pass that splits the
+LSTMCell into linear + split + sigmoid + tanh + add + mul *before* the
+quantizer annotates. When that lands, the four sigmoids + one tanh inside
+this test's LSTMCell will lower to cortex_m.quantized_activation and this
+test will pass -- at which point the xfail can be removed and the Silero
+expectations updated.
+"""
+
+import pytest
+import torch
+
+from executorch.backends.cortex_m.test.tester import CortexMTester
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+@pytest.mark.xfail(
+    reason="nn.LSTMCell is captured as a high-level op at export, so the "
+    "quantizer doesn't annotate the gate activations. Needs a "
+    "pre-annotation decompose pass to unblock.",
+    strict=True,
+)
+def test_lstm_cell_gates_lower():
+    hidden = 8
+    model = torch.nn.LSTMCell(hidden, hidden).eval()
+    x = torch.randn(1, hidden)
+    h = torch.zeros(1, hidden)
+    c = torch.zeros(1, hidden)
+
+    tester = CortexMTester(model, (x, (h, c)))
+    tester.quantize(None).export().to_edge().run_passes()
+
+    gm = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
+    quantized_activations = [
+        n
+        for n in gm.graph.nodes
+        if n.op == "call_function"
+        and n.target == exir_ops.edge.cortex_m.quantized_activation.default
+    ]
+    # An LSTMCell has 3 sigmoid gates (i, f, o) + 1 tanh gate (g) + 1 output
+    # tanh = 5 activation calls; all should lower once the decompose pass
+    # makes them visible to the quantizer.
+    assert len(quantized_activations) == 5, (
+        f"expected 5 quantized_activation nodes (3 sigmoid gates + 2 tanh), "
+        f"got {len(quantized_activations)}"
+    )
diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py
index 27b958627bb..9793f94f2c6 100644
--- a/backends/cortex_m/test/models/test_silero_vad.py
+++ b/backends/cortex_m/test/models/test_silero_vad.py
@@ -36,9 +36,18 @@
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
     "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 15,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14,
 }
+# The final `sigmoid(final_conv(x))` now lowers to cortex_m.quantized_activation.
+# The 3 remaining sigmoids and 2 tanhs are LSTMCell gates: PyTorch export
+# captures nn.LSTMCell as a single high-level op, so the quantizer never sees
+# the gate activations and can't annotate them. They're decomposed only at
+# to_edge -- which runs after the quantizer, so by then the gates have no
+# qparams to fold and the lowering pass correctly skips them. The unblocker
+# is a pre-annotation decompose pass that splits nn.LSTMCell into linear +
+# split + sigmoid + tanh + add + mul *before* prepare_pt2e runs; tracked as
+# the LSTMCell verification follow-up.
 ops_after_transforms: dict[str, int] = {
     "executorch_exir_dialects_edge__ops_aten_abs_default": 2,
     "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2,
@@ -52,7 +61,7 @@
     "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2,
     "executorch_exir_dialects_edge__ops_aten_relu_default": 5,
     "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2,
-    "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4,
+    "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 3,
     "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1,
     "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1,
@@ -61,8 +70,9 @@
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
     "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6,
-    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 7,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 7,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
 }
 
diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py
new file mode 100644
index 00000000000..24e0294cf85
--- /dev/null
+++ b/backends/cortex_m/test/ops/test_activation_quant.py
@@ -0,0 +1,152 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+
+
+# A single per-op `ops_after_transforms` shape is enough: every supported
+# activation lowers to exactly one cortex_m.quantized_activation, with the
+# AoT LUT stored as a constant placeholder and a single quant/dequant pair
+# at the graph boundary.
+_OPS_BEFORE = {
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+}
+_OPS_AFTER = {
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+}
+
+
+class _Sigmoid(torch.nn.Module):
+    ops_before_transforms = {
+        **_OPS_BEFORE,
+        "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.sigmoid(x)
+
+
+class _Tanh(torch.nn.Module):
+    ops_before_transforms = {
+        **_OPS_BEFORE,
+        "executorch_exir_dialects_edge__ops_aten_tanh_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.tanh(x)
+
+
+class _SiLU(torch.nn.Module):
+    ops_before_transforms = {
+        **_OPS_BEFORE,
+        "executorch_exir_dialects_edge__ops_aten_silu_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.nn.functional.silu(x)
+
+
+import torch as _torch
+
+
+def _zero_input(shape):
+    return _torch.zeros(shape, dtype=_torch.float32)
+
+
+# Wide-magnitude inputs exercise the `max(-128, min(127, q_out))` clamp inside
+# build_activation_lut; shifted-ramp inputs push the quantizer to pick a
+# non-zero `input_zp`, exercising the `(q - input_zp) * input_scale` term in
+# the LUT formula; all-zero inputs pin down the lut entry at `input_zp + 128`.
+test_cases = {
+    "sigmoid_rank1": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-6, 6, (16,)),),
+    ),
+    "sigmoid_rank4": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
+    ),
+    "sigmoid_saturating": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-50, 50, (32,)),),
+    ),
+    "sigmoid_asymmetric_zp": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-1, 9, (16,)),),
+    ),
+    "sigmoid_zero": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+    "tanh_rank1": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-3, 3, (16,)),),
+    ),
+    "tanh_rank3": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-2, 2, (1, 4, 16)),),
+    ),
+    "tanh_saturating": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-30, 30, (32,)),),
+    ),
+    "tanh_asymmetric_zp": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-1, 5, (16,)),),
+    ),
+    "tanh_zero": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+    "silu_rank1": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-6, 6, (16,)),),
+    ),
+    "silu_rank4": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
+    ),
+    "silu_saturating": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-50, 50, (32,)),),
+    ),
+    "silu_asymmetric_zp": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-1, 9, (16,)),),
+    ),
+    "silu_zero": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+}
+
+
+@parametrize("test_case", test_cases)
+def test_dialect_quantized_activation(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms,
+        test_case.model.ops_after_transforms,
+        qtol=1,
+    )
+
+
+@parametrize("test_case", test_cases)
+def test_implementation_quantized_activation(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_implementation(qtol=1)
diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py
index e9912d03cad..5a56ad62e92 100644
--- a/backends/cortex_m/test/tester.py
+++ b/backends/cortex_m/test/tester.py
@@ -42,6 +42,14 @@ def __init__(self):
                 torch.ops.aten.hardsigmoid_.default,
                 torch.ops.aten.hardswish.default,
                 torch.ops.aten.hardswish_.default,
+                # silu naturally decomposes to sigmoid*x at the to_edge step.
+                # Preserve it so the LUT lowering can collapse it into a single
+                # cortex_m.quantized_activation call rather than emitting an
+                # extra elementwise mul. Set globally because no per-test
+                # opt-out exists today; any new cortex_m test that uses SiLU
+                # must therefore expect a single aten.silu op in the edge graph
+                # (not sigmoid+mul).
+                torch.ops.aten.silu.default,
             ],
             _check_ir_validity=False,
             _core_aten_ops_exception_list=[torch.ops.aten.max_pool2d.default],

From 5045ac20c86e06668c2eef8c6cce4840ef48808e Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Wed, 27 May 2026 14:00:55 -0700
Subject: [PATCH 2/2] Cortex-M backend: address review feedback on
 quantized_activation

Adrian's three review comments on #19792, plus SIMD acceleration of the
LUT lookup (his comment asked for vector intrinsics and loop unrolling):

* Drop the target -> string indirection in the activation lowering.
  `passes_utils._ACTIVATION_FNS` now keys directly on the edge op target
  (`exir_ops.edge.aten.{sigmoid,tanh,silu}.default`), and
  `ConvertToCortexMPass._get_activation_replacement` passes `node.target`
  straight into `build_activation_lut` -- no `_ACTIVATION_KINDS` dict and no
  string round-trip.

* Replace the scalar LUT-lookup loop with three compile-gated paths:
  - M55/M85 (MVE): 16 lanes per iteration -- `vldrbq_u8` load, `vaddq_n_u8`
    to bias by 128, `vldrbq_gather_offset_s8` to gather the LUT result,
    `vstrbq_s8` to store.
  - M4/M7 (DSP, no MVE): 4 bytes per iteration -- fold four byte-loads into
    one word-load, batch the +128 bias with `__uadd8`, four LUT lookups
    (the DSP extension has no gather instruction; only MVE does), fold four
    byte-stores into one word-store. Uses `<arm_acle.h>` and local memcpy
    helpers rather than pulling in the heavyweight
    `arm_nnsupportfunctions.h`.
  - All other cores (M0+/M3): a 4x-unrolled scalar loop. On the two SIMD
    builds the same loop serves as the tail that handles the sub-vector
    remainder.

* Switch the source header to Meta's standard copyright block to match
  the other cortex_m op files.

Also drop test_lstm_cell.py: the LSTMCell gates can't lower until a
pre-annotation decompose pass lands, so the test isn't ready and is removed
until that follow-up work is done.

The MVE path is verified on the Corstone-300 FVP (cortex-m55) via the
existing test_implementation_quantized_activation suite. The three paths
were cross-compiled for cortex-m0plus / m4 / m7 / m55; the M4 build emits
`uadd8` and the M55 build emits the MVE gather.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../cortex_m/ops/op_quantized_activation.cpp  | 88 +++++++++++++++++--
 .../passes/convert_to_cortex_m_pass.py        |  9 +-
 backends/cortex_m/passes/passes_utils.py      | 19 ++--
 .../cortex_m/test/models/test_lstm_cell.py    | 58 ------------
 .../test/ops/test_activation_quant.py         |  2 +-
 5 files changed, 96 insertions(+), 80 deletions(-)
 delete mode 100644 backends/cortex_m/test/models/test_lstm_cell.py

diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp
index 454dc76bc8e..fb9b4768acf 100644
--- a/backends/cortex_m/ops/op_quantized_activation.cpp
+++ b/backends/cortex_m/ops/op_quantized_activation.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2026 Arm Limited and/or its affiliates.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -7,9 +8,39 @@
 
 #include "cortex_m_ops_common.h"
 
+#include <cstring>
+
+#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1)
+#include <arm_mve.h>
+#define HAS_HELIUM_SIMD 1
+#endif
+
+#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD)
+#include <arm_acle.h>
+#define HAS_DSP_PACKED_LUT 1
+#endif
+
 namespace cortex_m {
 namespace native {
 
+#if defined(HAS_DSP_PACKED_LUT)
+// Local 4-byte read/write helpers. We deliberately don't include
+// `arm_nnsupportfunctions.h` for the equivalent CMSIS-NN `arm_nn_read_s8x4_ia`
+// / `arm_nn_write_s8x4_ia` -- the header is public but pulls in the entire
+// CMSIS-NN support surface (~1500 lines) just for two memcpy wrappers.
+static inline uint32_t read_u8x4_ia(const int8_t** in) {
+  uint32_t val;
+  std::memcpy(&val, *in, 4);
+  *in += 4;
+  return val;
+}
+
+static inline void write_u8x4_ia(int8_t** out, uint32_t val) {
+  std::memcpy(*out, &val, 4);
+  *out += 4;
+}
+#endif
+
 // cppcheck-suppress unusedFunction
 Tensor& quantized_activation_out(
     KernelRuntimeContext& context,
@@ -37,12 +68,59 @@ Tensor& quantized_activation_out(
   const int8_t* lut_data = lut.const_data_ptr<int8_t>();
   int8_t* out_data = out.mutable_data_ptr<int8_t>();
 
-  // Bias the signed int8 input by 128 to use it as an unsigned table index;
-  // the LUT entries are precomputed AoT from the input/output qparams and the
+  // The LUT is precomputed AoT from the input/output qparams and the
   // activation function (sigmoid / tanh / silu / ...), so the kernel does not
-  // need to know which activation it is implementing.
+  // need to know which activation it is implementing. The signed int8 input
+  // is biased by 128 to use it as an unsigned [0, 255] table index.
   const int64_t n = input.numel();
-  for (int64_t i = 0; i < n; ++i) {
+  int64_t i = 0;
+
+#if defined(HAS_HELIUM_SIMD)
+  // M55/M85: 16 lanes per iteration. Reinterpret the int8 input as uint8
+  // (bit-identical load), add 128 mod 256 to produce a uint8 LUT index, then
+  // gather-load the int8 result from the LUT.
+  for (; i + 15 < n; i += 16) {
+    uint8x16_t in_u8 =
+        vldrbq_u8(reinterpret_cast<const uint8_t*>(in_data + i));
+    uint8x16_t idx = vaddq_n_u8(in_u8, 128);
+    int8x16_t result = vldrbq_gather_offset_s8(lut_data, idx);
+    vstrbq_s8(out_data + i, result);
+  }
+#elif defined(HAS_DSP_PACKED_LUT)
+  // M4/M7 (DSP, no MVE): process 4 bytes per iteration. The DSP win comes from
+  // (a) folding 4 byte-loads into one word-load, (b) batching the +128 bias
+  // with `__uadd8`, and (c) folding 4 byte-stores into one word-store. The
+  // LUT lookups themselves still hit memory four times per word -- the DSP
+  // extension has no gather load (only MVE does, handled above).
+  const int8_t* in_ptr = in_data;
+  int8_t* out_ptr = out_data;
+  const int64_t word_iters = n >> 2;
+  for (int64_t w = 0; w < word_iters; ++w) {
+    const uint32_t in_word = read_u8x4_ia(&in_ptr);
+    const uint32_t idx_word = __uadd8(in_word, 0x80808080u);
+    const uint32_t out_word =
+        static_cast<uint32_t>(static_cast<uint8_t>(lut_data[idx_word & 0xFFu])) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 8) & 0xFFu]))
+         << 8) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 16) & 0xFFu]))
+         << 16) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 24) & 0xFFu]))
+         << 24);
+    write_u8x4_ia(&out_ptr, out_word);
+  }
+  i = word_iters << 2;
+#endif
+
+  // 4x-unrolled scalar loop. On cores without MVE or DSP this is the main
+  // loop and the unroll exposes independent LUT loads; after the MVE path it
+  // covers the < 16-element remainder, and after the DSP path it never runs.
+  for (; i + 3 < n; i += 4) {
+    out_data[i + 0] = lut_data[static_cast<uint8_t>(in_data[i + 0] + 128)];
+    out_data[i + 1] = lut_data[static_cast<uint8_t>(in_data[i + 1] + 128)];
+    out_data[i + 2] = lut_data[static_cast<uint8_t>(in_data[i + 2] + 128)];
+    out_data[i + 3] = lut_data[static_cast<uint8_t>(in_data[i + 3] + 128)];
+  }
+  for (; i < n; ++i) {
     out_data[i] = lut_data[static_cast<uint8_t>(in_data[i] + 128)];
   }
 
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index 81cb5498217..24cc85bac66 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -486,12 +486,6 @@ def _get_bmm_replacement(self, node):
         )
         return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args
 
-    _ACTIVATION_KINDS = {
-        exir_ops.edge.aten.sigmoid.default: "sigmoid",
-        exir_ops.edge.aten.tanh.default: "tanh",
-        exir_ops.edge.aten.silu.default: "silu",
-    }
-
     def _get_activation_replacement(self, node):
         """Lower a standalone quantized sigmoid / tanh / silu to a single
         cortex_m.quantized_activation call backed by an AoT-built 256-entry
@@ -500,9 +494,8 @@ def _get_activation_replacement(self, node):
         """
         input_qparams = node.meta["input_qparams"][0]
         output_qparams = node.meta["output_qparams"][0]
-        kind = self._ACTIVATION_KINDS[node.target]
         lut_tensor = build_activation_lut(
-            kind,
+            node.target,
             float(input_qparams.scale),
             int(input_qparams.zp),
             float(output_qparams.scale),
diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py
index 64169e07521..24e2da95dba 100644
--- a/backends/cortex_m/passes/passes_utils.py
+++ b/backends/cortex_m/passes/passes_utils.py
@@ -205,9 +205,9 @@ def _stable_silu(x: float) -> float:
 
 
 _ACTIVATION_FNS = {
-    "sigmoid": _stable_sigmoid,
-    "tanh": math.tanh,
-    "silu": _stable_silu,
+    exir_ops.edge.aten.sigmoid.default: _stable_sigmoid,
+    exir_ops.edge.aten.tanh.default: math.tanh,
+    exir_ops.edge.aten.silu.default: _stable_silu,
 }
 
 
@@ -220,7 +220,7 @@ def _round_half_away_from_zero(x: float) -> int:
 
 
 def build_activation_lut(
-    kind: str,
+    target,
     input_scale: float,
     input_zp: int,
     output_scale: float,
@@ -228,17 +228,20 @@ def build_activation_lut(
 ) -> torch.Tensor:
     """AoT-compute a 256-entry int8 lookup table for a quantized activation.
 
+    `target` is the edge-dialect op being lowered (e.g.
+    `exir_ops.edge.aten.sigmoid.default`).
+
     The LUT is indexed by the input byte value biased by 128: for any int8
     input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output.
     Because the LUT is computed in float and quantized once per entry, the
     runtime kernel is a single memory-lookup with no requantization math.
     """
-    if kind not in _ACTIVATION_FNS:
+    if target not in _ACTIVATION_FNS:
         raise ValueError(
-            f"build_activation_lut: unknown activation '{kind}' "
-            f"(supported: {sorted(_ACTIVATION_FNS)})"
+            f"build_activation_lut: unsupported activation target {target!r} "
+            f"(supported: {sorted(t.__name__ for t in _ACTIVATION_FNS)})"
         )
-    f = _ACTIVATION_FNS[kind]
+    f = _ACTIVATION_FNS[target]
     lut = torch.empty(256, dtype=torch.int8)
     for q in range(-128, 128):
         x = (q - input_zp) * input_scale
diff --git a/backends/cortex_m/test/models/test_lstm_cell.py b/backends/cortex_m/test/models/test_lstm_cell.py
deleted file mode 100644
index c79574f955b..00000000000
--- a/backends/cortex_m/test/models/test_lstm_cell.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2026 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-"""Regression guard for the LSTMCell-gate lowering unblocker.
-
-`nn.LSTMCell` is exported as a single high-level op, so the quantizer never
-sees the gate sigmoids/tanhs and they end up unannotated in the edge graph.
-The lowering pass correctly skips them, but the user-facing effect is that
-Silero VAD's LSTM gates stay in aten even after the quantized_activation
-op lands.
-
-Unblocking this requires a pre-annotation decompose pass that splits the
-LSTMCell into linear + split + sigmoid + tanh + add + mul *before* the
-quantizer annotates. When that lands, the four sigmoids + one tanh inside
-this test's LSTMCell will lower to cortex_m.quantized_activation and this
-test will pass -- at which point the xfail can be removed and the Silero
-expectations updated.
-"""
-
-import pytest
-import torch
-
-from executorch.backends.cortex_m.test.tester import CortexMTester
-from executorch.backends.test.harness.stages import StageType
-from executorch.exir.dialects._ops import ops as exir_ops
-
-
-@pytest.mark.xfail(
-    reason="nn.LSTMCell is captured as a high-level op at export, so the "
-    "quantizer doesn't annotate the gate activations. Needs a "
-    "pre-annotation decompose pass to unblock.",
-    strict=True,
-)
-def test_lstm_cell_gates_lower():
-    hidden = 8
-    model = torch.nn.LSTMCell(hidden, hidden).eval()
-    x = torch.randn(1, hidden)
-    h = torch.zeros(1, hidden)
-    c = torch.zeros(1, hidden)
-
-    tester = CortexMTester(model, (x, (h, c)))
-    tester.quantize(None).export().to_edge().run_passes()
-
-    gm = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
-    quantized_activations = [
-        n
-        for n in gm.graph.nodes
-        if n.op == "call_function"
-        and n.target == exir_ops.edge.cortex_m.quantized_activation.default
-    ]
-    # An LSTMCell has 3 sigmoid gates (i, f, o) + 1 tanh gate (g) + 1 output
-    # tanh = 5 activation calls; all should lower once the decompose pass
-    # makes them visible to the quantizer.
-    assert len(quantized_activations) == 5, (
-        f"expected 5 quantized_activation nodes (3 sigmoid gates + 2 tanh), "
-        f"got {len(quantized_activations)}"
-    )
diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py
index 24e0294cf85..6ae82e1e70c 100644
--- a/backends/cortex_m/test/ops/test_activation_quant.py
+++ b/backends/cortex_m/test/ops/test_activation_quant.py
@@ -1,4 +1,4 @@
-# Copyright 2026 Arm Limited and/or its affiliates.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.