From b91a8ef93a77f6acd66788a2791e3dbe10547096 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 26 May 2026 16:13:50 -0700 Subject: [PATCH 1/2] Cortex-M backend: add quantized_activation op with LUT lowering for sigmoid/tanh/silu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CMSIS-NN has no s8 activation primitive — the s16 path requantizes around an on-target polynomial, which costs an extra s8 → s16 → activation → s8 trip per call. Instead this lowers standalone aten.sigmoid / aten.tanh / aten.silu to a single cortex_m.quantized_activation(input, lut) op backed by a 256-entry int8 LUT precomputed ahead of time (AoT) from the input/output qparams and the activation function. The kernel is a single byte-indexed lookup loop -- shape-agnostic, activation-agnostic, and free of any runtime requantization. Encoding the activation in the LUT bytes rather than a kind enum keeps the kernel surface to one op. For SiLU specifically, the LUT can encode `x * sigmoid(x)` directly, so the naive sigmoid-plus-elementwise-mul decomposition is unnecessary. aten.silu is added to the to_edge preserve_ops list so it doesn't decompose to sigmoid+mul before the lowering pass sees it; this is set globally because no per-test opt-out exists today. LUT-build numerics deliberately mirror the existing cortex_m CMSIS-NN conventions. Sigmoid/silu use a sign-branched stable form that always exponentiates a non-positive value, so the LUT build can't trip OverflowError for unusually wide input qparams. The final fp → int8 quantize uses round-half-away-from-zero, matching the rounding requantize_cmsis applies after its right-shift in passes_utils. 
In Silero VAD the final `sigmoid(final_conv(x))` now lowers; the 3 remaining sigmoids and 2 tanhs are LSTMCell gates and stay in aten because PyTorch export captures nn.LSTMCell as a single high-level op -- the quantizer never sees the gates and can't annotate them, and to_edge only decomposes the cell after the quantizer has run. test_lstm_cell.py captures the expected end-state as an xfail that will flip green once a pre-annotation decompose pass lands; that work is tracked as a separate follow-up. Other activations (GELU for KWT, Mish, ELU, Softplus) plug in as a few additional entries in passes_utils._ACTIVATION_FNS plus matching quantizer patterns. The generic op + LUT design carries them with no kernel changes. Co-authored-by: Claude --- backends/cortex_m/CMakeLists.txt | 1 + .../cortex_m/ops/op_quantized_activation.cpp | 53 ++++++ backends/cortex_m/ops/operators.py | 29 ++++ backends/cortex_m/ops/operators.yaml | 6 + .../passes/convert_to_cortex_m_pass.py | 50 +++++- backends/cortex_m/passes/passes_utils.py | 58 +++++++ .../cortex_m/quantizer/pattern_checkers.py | 19 +++ .../cortex_m/quantizer/quantizer_support.py | 8 + .../cortex_m/test/models/test_lstm_cell.py | 58 +++++++ .../cortex_m/test/models/test_silero_vad.py | 20 ++- .../test/ops/test_activation_quant.py | 152 ++++++++++++++++++ backends/cortex_m/test/tester.py | 8 + 12 files changed, 456 insertions(+), 6 deletions(-) create mode 100644 backends/cortex_m/ops/op_quantized_activation.cpp create mode 100644 backends/cortex_m/test/models/test_lstm_cell.py create mode 100644 backends/cortex_m/test/ops/test_activation_quant.py diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 627406c1935..f88a6306fed 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -81,6 +81,7 @@ set(_cortex_m_kernels__srcs ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_activation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_avg_pool2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_batch_matmul.cpp diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp new file mode 100644 index 00000000000..454dc76bc8e --- /dev/null +++ b/backends/cortex_m/ops/op_quantized_activation.cpp @@ -0,0 +1,53 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "cortex_m_ops_common.h" + +namespace cortex_m { +namespace native { + +// cppcheck-suppress unusedFunction +Tensor& quantized_activation_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& lut, + Tensor& out) { + ET_CHECK_MSG( + input.scalar_type() == ScalarType::Char, + "quantized_activation: input must be int8"); + ET_CHECK_MSG( + out.scalar_type() == ScalarType::Char, + "quantized_activation: output must be int8"); + ET_CHECK_MSG( + lut.scalar_type() == ScalarType::Char, + "quantized_activation: lut must be int8"); + ET_CHECK_MSG( + lut.numel() == 256, + "quantized_activation: lut must have 256 entries, got %" PRId64, + static_cast(lut.numel())); + ET_CHECK_MSG( + input.numel() == out.numel(), + "quantized_activation: input and output must have the same numel"); + + const int8_t* in_data = input.const_data_ptr(); + const int8_t* lut_data = lut.const_data_ptr(); + int8_t* out_data = out.mutable_data_ptr(); + + // Bias the signed int8 input by 128 to use it as an unsigned table index; + // the LUT entries are precomputed AoT from the input/output qparams and the + // activation function (sigmoid / tanh / silu / ...), so the kernel does not + // need to know which activation it is 
implementing. + const int64_t n = input.numel(); + for (int64_t i = 0; i < n; ++i) { + out_data[i] = lut_data[static_cast(in_data[i] + 128)]; + } + + return out; +} + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index d4393bc7ada..4c6fb44e89d 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -264,6 +264,35 @@ def quantized_mul_impl( return result +# =================================================================== +# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION +# =================================================================== +# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT +# from the input/output qparams and the activation function (sigmoid, tanh, +# silu, ...), so the kernel is identical regardless of which activation it +# evaluates: out[i] = lut[input[i] + 128]. +lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor") +lib.define( + "quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) 
out) -> Tensor(a!)" +) + + +@register_fake("cortex_m::quantized_activation") # type: ignore[misc] +def quantized_activation_meta(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor: + assert input.dtype == torch.int8, "quantized_activation input must be int8" + assert lut.dtype == torch.int8 and lut.numel() == 256, ( + "quantized_activation lut must be int8 with 256 entries; " + f"got dtype={lut.dtype}, numel={lut.numel()}" + ) + return torch.empty_like(input) + + +@impl(lib, "quantized_activation", "CompositeExplicitAutograd") # type: ignore[misc] +def quantized_activation_impl(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor: + indices = input.to(torch.int32) + 128 + return lut[indices].to(torch.int8) + + # =================================================================== # QUANTIZED BATCH MATMUL OPERATION DEFINITION # =================================================================== diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index 8db109dea43..8eacf2f49b9 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -29,6 +29,12 @@ - arg_meta: null kernel_name: cortex_m::quantized_mul_out +- func: cortex_m::quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: cortex_m::quantized_activation_out + - func: cortex_m::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
variants: function kernels: diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 5704645caf8..81cb5498217 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -13,7 +13,10 @@ from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.cortex_m.passes.cortex_m_pass import CortexMPass -from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot +from executorch.backends.cortex_m.passes.passes_utils import ( + build_activation_lut, + quantize_multiplier_aot, +) from executorch.backends.cortex_m.passes.scratch_buffer_sizes import ( required_cmsis_nn_buffer_sizes, ) @@ -483,6 +486,45 @@ def _get_bmm_replacement(self, node): ) return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args + _ACTIVATION_KINDS = { + exir_ops.edge.aten.sigmoid.default: "sigmoid", + exir_ops.edge.aten.tanh.default: "tanh", + exir_ops.edge.aten.silu.default: "silu", + } + + def _get_activation_replacement(self, node): + """Lower a standalone quantized sigmoid / tanh / silu to a single + cortex_m.quantized_activation call backed by an AoT-built 256-entry + int8 LUT. The kernel is shape-agnostic; the LUT encodes both the + activation function and the input/output qparams. + """ + input_qparams = node.meta["input_qparams"][0] + output_qparams = node.meta["output_qparams"][0] + kind = self._ACTIVATION_KINDS[node.target] + lut_tensor = build_activation_lut( + kind, + float(input_qparams.scale), + int(input_qparams.zp), + float(output_qparams.scale), + int(output_qparams.zp), + ) + + # Constant placeholders must appear before user-input placeholders; + # anchor on the first existing placeholder so the new LUT lands in the + # constant-placeholder block at the top of the graph. 
+ first_placeholder = next(n for n in node.graph.nodes if n.op == "placeholder") + with node.graph.inserting_before(first_placeholder): + lut_node = create_constant_placeholder( + self.exported_program, + node.graph, + node.name + "_lut", + InputKind.PARAMETER, + lut_tensor, + ) + + new_args = (node.args[0], lut_node) + return exir_ops.edge.cortex_m.quantized_activation.default, new_args + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: modified = False for node in graph_module.graph.nodes: @@ -506,6 +548,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: op, args = self._get_convolution_replacement(node) case exir_ops.edge.aten.bmm.default: op, args = self._get_bmm_replacement(node) + case ( + exir_ops.edge.aten.sigmoid.default + | exir_ops.edge.aten.tanh.default + | exir_ops.edge.aten.silu.default + ): + op, args = self._get_activation_replacement(node) case _: continue diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py index fcbfa301b06..64169e07521 100644 --- a/backends/cortex_m/passes/passes_utils.py +++ b/backends/cortex_m/passes/passes_utils.py @@ -190,6 +190,64 @@ def is_qualified_int8_node(args) -> bool: return False +def _stable_sigmoid(x: float) -> float: + # Always exponentiate the non-positive value so `math.exp` never overflows + # for unusually large `|x|` (e.g. wide-range input qparams). Algebraically + # identical to `1 / (1 + exp(-x))`. + if x >= 0: + return 1.0 / (1.0 + math.exp(-x)) + e = math.exp(x) + return e / (1.0 + e) + + +def _stable_silu(x: float) -> float: + return x * _stable_sigmoid(x) + + +_ACTIVATION_FNS = { + "sigmoid": _stable_sigmoid, + "tanh": math.tanh, + "silu": _stable_silu, +} + + +def _round_half_away_from_zero(x: float) -> int: + # Matches the rounding convention `requantize_cmsis` (above) applies after + # the right-shift step: ties on positive values round toward +∞, ties on + # negative values round toward -∞. 
Python's built-in `round` would use + # banker's rounding instead and disagree at exact half-integers. + return int(math.copysign(math.floor(abs(x) + 0.5), x)) if x != 0 else 0 + + +def build_activation_lut( + kind: str, + input_scale: float, + input_zp: int, + output_scale: float, + output_zp: int, +) -> torch.Tensor: + """AoT-compute a 256-entry int8 lookup table for a quantized activation. + + The LUT is indexed by the input byte value biased by 128: for any int8 + input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output. + Because the LUT is computed in float and quantized once per entry, the + runtime kernel is a single memory-lookup with no requantization math. + """ + if kind not in _ACTIVATION_FNS: + raise ValueError( + f"build_activation_lut: unknown activation '{kind}' " + f"(supported: {sorted(_ACTIVATION_FNS)})" + ) + f = _ACTIVATION_FNS[kind] + lut = torch.empty(256, dtype=torch.int8) + for q in range(-128, 128): + x = (q - input_zp) * input_scale + y = f(x) + q_out = _round_half_away_from_zero(y / output_scale + output_zp) + lut[q + 128] = max(-128, min(127, q_out)) + return lut + + def quantize_multiplier_aot(scale: float) -> tuple[int, int]: if scale == 0.0: return 0, 0 diff --git a/backends/cortex_m/quantizer/pattern_checkers.py b/backends/cortex_m/quantizer/pattern_checkers.py index 860d8345607..5715ca042de 100644 --- a/backends/cortex_m/quantizer/pattern_checkers.py +++ b/backends/cortex_m/quantizer/pattern_checkers.py @@ -99,6 +99,25 @@ def check_quantization_config( return is_int8 +class CortexMActivationCheck(PatternCheck): + """Accept standalone elementwise activations (sigmoid / tanh / silu) + that the LUT-based cortex_m.quantized_activation op handles uniformly. + + The kernel is shape-agnostic and the LUT is computed AoT from per-tensor + qparams, so the only thing to enforce is int8 per-tensor quantization. 
+ """ + + @classmethod + def check_quantization_config( + cls, pattern: list[Node], quantization_config: QuantizationConfig + ) -> bool: + is_int8 = cls.is_int8_activations(quantization_config) + is_per_tensor = cls.is_per_tensor( + quantization_config.get_input_act_qspec() + ) and cls.is_per_tensor(quantization_config.get_output_act_qspec()) + return is_int8 and is_per_tensor + + class CortexMSoftmaxCheck(PatternCheck): @classmethod diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py index 3dfbb67638a..317189a5f3e 100644 --- a/backends/cortex_m/quantizer/quantizer_support.py +++ b/backends/cortex_m/quantizer/quantizer_support.py @@ -5,6 +5,7 @@ import torch from executorch.backends.cortex_m.quantizer.pattern_checkers import ( + CortexMActivationCheck, CortexMAddMulCheck, CortexMAvgPool2DCheck, CortexMBmmCheck, @@ -119,6 +120,12 @@ (torch.ops.aten.softmax.int,): CortexMSoftmaxCheck, } +ACTIVATION_OP_PATTERNS = { + (torch.ops.aten.sigmoid.default,): CortexMActivationCheck, + (torch.ops.aten.tanh.default,): CortexMActivationCheck, + (torch.ops.aten.silu.default,): CortexMActivationCheck, +} + POOL_OP_PATTERNS = { (torch.ops.aten.avg_pool2d.default,): CortexMAvgPool2DCheck, (torch.ops.aten.max_pool2d.default,): CortexMMaxPool2DCheck, @@ -161,4 +168,5 @@ | CONV_TRANSPOSE_OP_PATTERNS | POOL_OP_PATTERNS | BMM_OP_PATTERNS + | ACTIVATION_OP_PATTERNS ) diff --git a/backends/cortex_m/test/models/test_lstm_cell.py b/backends/cortex_m/test/models/test_lstm_cell.py new file mode 100644 index 00000000000..c79574f955b --- /dev/null +++ b/backends/cortex_m/test/models/test_lstm_cell.py @@ -0,0 +1,58 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +"""Regression guard for the LSTMCell-gate lowering unblocker. 
+ +`nn.LSTMCell` is exported as a single high-level op, so the quantizer never +sees the gate sigmoids/tanhs and they end up unannotated in the edge graph. +The lowering pass correctly skips them, but the user-facing effect is that +Silero VAD's LSTM gates stay in aten even after the quantized_activation +op lands. + +Unblocking this requires a pre-annotation decompose pass that splits the +LSTMCell into linear + split + sigmoid + tanh + add + mul *before* the +quantizer annotates. When that lands, the four sigmoids + one tanh inside +this test's LSTMCell will lower to cortex_m.quantized_activation and this +test will pass -- at which point the xfail can be removed and the Silero +expectations updated. +""" + +import pytest +import torch + +from executorch.backends.cortex_m.test.tester import CortexMTester +from executorch.backends.test.harness.stages import StageType +from executorch.exir.dialects._ops import ops as exir_ops + + +@pytest.mark.xfail( + reason="nn.LSTMCell is captured as a high-level op at export, so the " + "quantizer doesn't annotate the gate activations. Needs a " + "pre-annotation decompose pass to unblock.", + strict=True, +) +def test_lstm_cell_gates_lower(): + hidden = 8 + model = torch.nn.LSTMCell(hidden, hidden).eval() + x = torch.randn(1, hidden) + h = torch.zeros(1, hidden) + c = torch.zeros(1, hidden) + + tester = CortexMTester(model, (x, (h, c))) + tester.quantize(None).export().to_edge().run_passes() + + gm = tester.get_artifact(StageType.RUN_PASSES).exported_program().module() + quantized_activations = [ + n + for n in gm.graph.nodes + if n.op == "call_function" + and n.target == exir_ops.edge.cortex_m.quantized_activation.default + ] + # An LSTMCell has 3 sigmoid gates (i, f, o) + 1 tanh gate (g) + 1 output + # tanh = 5 activation calls; all should lower once the decompose pass + # makes them visible to the quantizer. 
+ assert len(quantized_activations) == 5, ( + f"expected 5 quantized_activation nodes (3 sigmoid gates + 2 tanh), " + f"got {len(quantized_activations)}" + ) diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py index 27b958627bb..9793f94f2c6 100644 --- a/backends/cortex_m/test/models/test_silero_vad.py +++ b/backends/cortex_m/test/models/test_silero_vad.py @@ -36,9 +36,18 @@ "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12, - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 15, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14, } +# The final `sigmoid(final_conv(x))` now lowers to cortex_m.quantized_activation. +# The 3 remaining sigmoids and 2 tanhs are LSTMCell gates: PyTorch export +# captures nn.LSTMCell as a single high-level op, so the quantizer never sees +# the gate activations and can't annotate them. They're decomposed only at +# to_edge -- which runs after the quantizer, so by then the gates have no +# qparams to fold and the lowering pass correctly skips them. The unblocker +# is a pre-annotation decompose pass that splits nn.LSTMCell into linear + +# split + sigmoid + tanh + add + mul *before* prepare_pt2e runs; tracked as +# the LSTMCell verification follow-up. 
ops_after_transforms: dict[str, int] = { "executorch_exir_dialects_edge__ops_aten_abs_default": 2, "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, @@ -52,7 +61,7 @@ "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2, "executorch_exir_dialects_edge__ops_aten_relu_default": 5, "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2, - "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4, + "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 3, "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, @@ -61,8 +70,9 @@ "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, - "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6, - "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 7, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 7, + "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1, "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, } diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py new file mode 100644 index 00000000000..24e0294cf85 --- /dev/null +++ b/backends/cortex_m/test/ops/test_activation_quant.py @@ -0,0 +1,152 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) + + +# A single per-op `ops_after_transforms` shape is enough: every supported +# activation lowers to exactly one cortex_m.quantized_activation, with the +# AoT LUT stored as a constant placeholder and a single quant/dequant pair +# at the graph boundary. +_OPS_BEFORE = { + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, +} +_OPS_AFTER = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, +} + + +class _Sigmoid(torch.nn.Module): + ops_before_transforms = { + **_OPS_BEFORE, + "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 1, + } + ops_after_transforms = _OPS_AFTER + + def forward(self, x): + return torch.sigmoid(x) + + +class _Tanh(torch.nn.Module): + ops_before_transforms = { + **_OPS_BEFORE, + "executorch_exir_dialects_edge__ops_aten_tanh_default": 1, + } + ops_after_transforms = _OPS_AFTER + + def forward(self, x): + return torch.tanh(x) + + +class _SiLU(torch.nn.Module): + ops_before_transforms = { + **_OPS_BEFORE, + "executorch_exir_dialects_edge__ops_aten_silu_default": 1, + } + ops_after_transforms = _OPS_AFTER + + def forward(self, x): + return torch.nn.functional.silu(x) + + +import torch as _torch + + +def _zero_input(shape): + return _torch.zeros(shape, dtype=_torch.float32) + + +# Wide-magnitude inputs exercise the `max(-128, min(127, q_out))` clamp inside +# build_activation_lut; shifted-ramp inputs push the quantizer to pick a +# non-zero `input_zp`, exercising the `(q - input_zp) * input_scale` term in +# the LUT formula; all-zero 
inputs pin down the lut entry at `input_zp + 128`. +test_cases = { + "sigmoid_rank1": McuTestCase( + model=_Sigmoid(), + example_inputs=(ramp_tensor(-6, 6, (16,)),), + ), + "sigmoid_rank4": McuTestCase( + model=_Sigmoid(), + example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),), + ), + "sigmoid_saturating": McuTestCase( + model=_Sigmoid(), + example_inputs=(ramp_tensor(-50, 50, (32,)),), + ), + "sigmoid_asymmetric_zp": McuTestCase( + model=_Sigmoid(), + example_inputs=(ramp_tensor(-1, 9, (16,)),), + ), + "sigmoid_zero": McuTestCase( + model=_Sigmoid(), + example_inputs=(_zero_input((16,)),), + ), + "tanh_rank1": McuTestCase( + model=_Tanh(), + example_inputs=(ramp_tensor(-3, 3, (16,)),), + ), + "tanh_rank3": McuTestCase( + model=_Tanh(), + example_inputs=(ramp_tensor(-2, 2, (1, 4, 16)),), + ), + "tanh_saturating": McuTestCase( + model=_Tanh(), + example_inputs=(ramp_tensor(-30, 30, (32,)),), + ), + "tanh_asymmetric_zp": McuTestCase( + model=_Tanh(), + example_inputs=(ramp_tensor(-1, 5, (16,)),), + ), + "tanh_zero": McuTestCase( + model=_Tanh(), + example_inputs=(_zero_input((16,)),), + ), + "silu_rank1": McuTestCase( + model=_SiLU(), + example_inputs=(ramp_tensor(-6, 6, (16,)),), + ), + "silu_rank4": McuTestCase( + model=_SiLU(), + example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),), + ), + "silu_saturating": McuTestCase( + model=_SiLU(), + example_inputs=(ramp_tensor(-50, 50, (32,)),), + ), + "silu_asymmetric_zp": McuTestCase( + model=_SiLU(), + example_inputs=(ramp_tensor(-1, 9, (16,)),), + ), + "silu_zero": McuTestCase( + model=_SiLU(), + example_inputs=(_zero_input((16,)),), + ), +} + + +@parametrize("test_case", test_cases) +def test_dialect_quantized_activation(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, + test_case.model.ops_after_transforms, + qtol=1, + ) + + +@parametrize("test_case", test_cases) +def test_implementation_quantized_activation(test_case): + tester 
= CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation(qtol=1) diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py index e9912d03cad..5a56ad62e92 100644 --- a/backends/cortex_m/test/tester.py +++ b/backends/cortex_m/test/tester.py @@ -42,6 +42,14 @@ def __init__(self): torch.ops.aten.hardsigmoid_.default, torch.ops.aten.hardswish.default, torch.ops.aten.hardswish_.default, + # silu naturally decomposes to sigmoid*x at the to_edge step. + # Preserve it so the LUT lowering can collapse it into a single + # cortex_m.quantized_activation call rather than emitting an + # extra elementwise mul. Set globally because no per-test + # opt-out exists today; any new cortex_m test that uses SiLU + # must therefore expect a single aten.silu op in the edge graph + # (not sigmoid+mul). + torch.ops.aten.silu.default, ], _check_ir_validity=False, _core_aten_ops_exception_list=[torch.ops.aten.max_pool2d.default], From 5045ac20c86e06668c2eef8c6cce4840ef48808e Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Wed, 27 May 2026 14:00:55 -0700 Subject: [PATCH 2/2] Cortex-M backend: address review feedback on quantized_activation Adrian's three review comments on #19792, plus SIMD acceleration of the LUT lookup (his comment asked for vector intrinsics and loop unrolling): * Drop the target -> string indirection in the activation lowering. `passes_utils._ACTIVATION_FNS` now keys directly on the edge op target (`exir_ops.edge.aten.{sigmoid,tanh,silu}.default`), and `ConvertToCortexMPass._get_activation_replacement` passes `node.target` straight into `build_activation_lut` -- no `_ACTIVATION_KINDS` dict and no string round-trip. * Replace the scalar LUT-lookup loop with three compile-gated paths: - M55/M85 (MVE): 16 lanes per iteration -- `vldrbq_u8` load, `vaddq_n_u8` to bias by 128, `vldrbq_gather_offset_s8` to gather the LUT result, `vstrbq_s8` to store. 
- M4/M7 (DSP, no MVE): 4 bytes per iteration -- fold four byte-loads into one word-load, batch the +128 bias with `__uadd8`, four LUT lookups (no M-class gather instruction exists), fold four byte-stores into one word-store. Uses `<arm_acle.h>` and local memcpy helpers rather than pulling in the heavyweight `arm_nnsupportfunctions.h`. - All other cores (M0+/M3): a 4x-unrolled scalar tail, which also handles the sub-vector remainder of the two SIMD paths. * Switch the source header to Meta's standard copyright block to match the other cortex_m op files. Also drop test_lstm_cell.py: the LSTMCell gates can't lower until a pre-annotation decompose pass lands, so the test isn't ready and is removed until that follow-up work is done. The MVE path is verified on the Corstone-300 FVP (cortex-m55) via the existing test_implementation_quantized_activation suite. The three paths were cross-compiled for cortex-m0plus / m4 / m7 / m55; the M4 build emits `uadd8` and the M55 build emits the MVE gather. Co-authored-by: Claude --- .../cortex_m/ops/op_quantized_activation.cpp | 88 +++++++++++++++++-- .../passes/convert_to_cortex_m_pass.py | 9 +- backends/cortex_m/passes/passes_utils.py | 19 ++-- .../cortex_m/test/models/test_lstm_cell.py | 58 ------------ .../test/ops/test_activation_quant.py | 2 +- 5 files changed, 96 insertions(+), 80 deletions(-) delete mode 100644 backends/cortex_m/test/models/test_lstm_cell.py diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp index 454dc76bc8e..fb9b4768acf 100644 --- a/backends/cortex_m/ops/op_quantized_activation.cpp +++ b/backends/cortex_m/ops/op_quantized_activation.cpp @@ -1,5 +1,6 @@ /* - * Copyright 2026 Arm Limited and/or its affiliates. + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -7,9 +8,39 @@ #include "cortex_m_ops_common.h" +#include + +#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1) +#include +#define HAS_HELIUM_SIMD 1 +#endif + +#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD) +#include +#define HAS_DSP_PACKED_LUT 1 +#endif + namespace cortex_m { namespace native { +#if defined(HAS_DSP_PACKED_LUT) +// Local 4-byte read/write helpers. We deliberately don't include +// `arm_nnsupportfunctions.h` for the equivalent CMSIS-NN `arm_nn_read_s8x4_ia` +// / `arm_nn_write_s8x4_ia` -- the header is public but pulls in the entire +// CMSIS-NN support surface (~1500 lines) just for two memcpy wrappers. +static inline uint32_t read_u8x4_ia(const int8_t** in) { + uint32_t val; + std::memcpy(&val, *in, 4); + *in += 4; + return val; +} + +static inline void write_u8x4_ia(int8_t** out, uint32_t val) { + std::memcpy(*out, &val, 4); + *out += 4; +} +#endif + // cppcheck-suppress unusedFunction Tensor& quantized_activation_out( KernelRuntimeContext& context, @@ -37,12 +68,59 @@ Tensor& quantized_activation_out( const int8_t* lut_data = lut.const_data_ptr(); int8_t* out_data = out.mutable_data_ptr(); - // Bias the signed int8 input by 128 to use it as an unsigned table index; - // the LUT entries are precomputed AoT from the input/output qparams and the + // The LUT is precomputed AoT from the input/output qparams and the // activation function (sigmoid / tanh / silu / ...), so the kernel does not - // need to know which activation it is implementing. + // need to know which activation it is implementing. The signed int8 input + // is biased by 128 to use it as an unsigned [0, 255] table index. const int64_t n = input.numel(); - for (int64_t i = 0; i < n; ++i) { + int64_t i = 0; + +#if defined(HAS_HELIUM_SIMD) + // M55/M85: 16 lanes per iteration. Reinterpret the int8 input as uint8 + // (bit-identical load), add 128 mod 256 to produce a uint8 LUT index, then + // gather-load the int8 result from the LUT. 
+ for (; i + 15 < n; i += 16) { + uint8x16_t in_u8 = + vldrbq_u8(reinterpret_cast(in_data + i)); + uint8x16_t idx = vaddq_n_u8(in_u8, 128); + int8x16_t result = vldrbq_gather_offset_s8(lut_data, idx); + vstrbq_s8(out_data + i, result); + } +#elif defined(HAS_DSP_PACKED_LUT) + // M4/M7 (DSP, no MVE): process 4 bytes per iteration. The DSP win comes from + // (a) folding 4 byte-loads into one word-load, (b) batching the +128 bias + // with `__uadd8`, and (c) folding 4 byte-stores into one word-store. The + // LUT lookups themselves still hit memory four times per word -- no DSP + // gather instruction exists on M-class. + const int8_t* in_ptr = in_data; + int8_t* out_ptr = out_data; + const int64_t word_iters = n >> 2; + for (int64_t w = 0; w < word_iters; ++w) { + const uint32_t in_word = read_u8x4_ia(&in_ptr); + const uint32_t idx_word = __uadd8(in_word, 0x80808080u); + const uint32_t out_word = + static_cast(static_cast(lut_data[idx_word & 0xFFu])) | + (static_cast(static_cast(lut_data[(idx_word >> 8) & 0xFFu])) + << 8) | + (static_cast(static_cast(lut_data[(idx_word >> 16) & 0xFFu])) + << 16) | + (static_cast(static_cast(lut_data[(idx_word >> 24) & 0xFFu])) + << 24); + write_u8x4_ia(&out_ptr, out_word); + } + i = word_iters << 2; +#endif + + // 4x-unrolled scalar tail. On M-class cores without MVE or DSP the unroll + // lets the compiler issue independent LUT loads; on the MVE / DSP paths + // above this only runs for the < 16- (or < 4-) element remainder. 
+ for (; i + 3 < n; i += 4) { + out_data[i + 0] = lut_data[static_cast(in_data[i + 0] + 128)]; + out_data[i + 1] = lut_data[static_cast(in_data[i + 1] + 128)]; + out_data[i + 2] = lut_data[static_cast(in_data[i + 2] + 128)]; + out_data[i + 3] = lut_data[static_cast(in_data[i + 3] + 128)]; + } + for (; i < n; ++i) { out_data[i] = lut_data[static_cast(in_data[i] + 128)]; } diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 81cb5498217..24cc85bac66 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -486,12 +486,6 @@ def _get_bmm_replacement(self, node): ) return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args - _ACTIVATION_KINDS = { - exir_ops.edge.aten.sigmoid.default: "sigmoid", - exir_ops.edge.aten.tanh.default: "tanh", - exir_ops.edge.aten.silu.default: "silu", - } - def _get_activation_replacement(self, node): """Lower a standalone quantized sigmoid / tanh / silu to a single cortex_m.quantized_activation call backed by an AoT-built 256-entry @@ -500,9 +494,8 @@ def _get_activation_replacement(self, node): """ input_qparams = node.meta["input_qparams"][0] output_qparams = node.meta["output_qparams"][0] - kind = self._ACTIVATION_KINDS[node.target] lut_tensor = build_activation_lut( - kind, + node.target, float(input_qparams.scale), int(input_qparams.zp), float(output_qparams.scale), diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py index 64169e07521..24e2da95dba 100644 --- a/backends/cortex_m/passes/passes_utils.py +++ b/backends/cortex_m/passes/passes_utils.py @@ -205,9 +205,9 @@ def _stable_silu(x: float) -> float: _ACTIVATION_FNS = { - "sigmoid": _stable_sigmoid, - "tanh": math.tanh, - "silu": _stable_silu, + exir_ops.edge.aten.sigmoid.default: _stable_sigmoid, + exir_ops.edge.aten.tanh.default: math.tanh, + exir_ops.edge.aten.silu.default: _stable_silu, 
} @@ -220,7 +220,7 @@ def _round_half_away_from_zero(x: float) -> int: def build_activation_lut( - kind: str, + target, input_scale: float, input_zp: int, output_scale: float, @@ -228,17 +228,20 @@ def build_activation_lut( ) -> torch.Tensor: """AoT-compute a 256-entry int8 lookup table for a quantized activation. + `target` is the edge-dialect op being lowered (e.g. + `exir_ops.edge.aten.sigmoid.default`). + The LUT is indexed by the input byte value biased by 128: for any int8 input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output. Because the LUT is computed in float and quantized once per entry, the runtime kernel is a single memory-lookup with no requantization math. """ - if kind not in _ACTIVATION_FNS: + if target not in _ACTIVATION_FNS: raise ValueError( - f"build_activation_lut: unknown activation '{kind}' " - f"(supported: {sorted(_ACTIVATION_FNS)})" + f"build_activation_lut: unsupported activation target {target!r} " + f"(supported: {sorted(t.__name__ for t in _ACTIVATION_FNS)})" ) - f = _ACTIVATION_FNS[kind] + f = _ACTIVATION_FNS[target] lut = torch.empty(256, dtype=torch.int8) for q in range(-128, 128): x = (q - input_zp) * input_scale diff --git a/backends/cortex_m/test/models/test_lstm_cell.py b/backends/cortex_m/test/models/test_lstm_cell.py deleted file mode 100644 index c79574f955b..00000000000 --- a/backends/cortex_m/test/models/test_lstm_cell.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2026 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -"""Regression guard for the LSTMCell-gate lowering unblocker. - -`nn.LSTMCell` is exported as a single high-level op, so the quantizer never -sees the gate sigmoids/tanhs and they end up unannotated in the edge graph. 
-The lowering pass correctly skips them, but the user-facing effect is that -Silero VAD's LSTM gates stay in aten even after the quantized_activation -op lands. - -Unblocking this requires a pre-annotation decompose pass that splits the -LSTMCell into linear + split + sigmoid + tanh + add + mul *before* the -quantizer annotates. When that lands, the four sigmoids + one tanh inside -this test's LSTMCell will lower to cortex_m.quantized_activation and this -test will pass -- at which point the xfail can be removed and the Silero -expectations updated. -""" - -import pytest -import torch - -from executorch.backends.cortex_m.test.tester import CortexMTester -from executorch.backends.test.harness.stages import StageType -from executorch.exir.dialects._ops import ops as exir_ops - - -@pytest.mark.xfail( - reason="nn.LSTMCell is captured as a high-level op at export, so the " - "quantizer doesn't annotate the gate activations. Needs a " - "pre-annotation decompose pass to unblock.", - strict=True, -) -def test_lstm_cell_gates_lower(): - hidden = 8 - model = torch.nn.LSTMCell(hidden, hidden).eval() - x = torch.randn(1, hidden) - h = torch.zeros(1, hidden) - c = torch.zeros(1, hidden) - - tester = CortexMTester(model, (x, (h, c))) - tester.quantize(None).export().to_edge().run_passes() - - gm = tester.get_artifact(StageType.RUN_PASSES).exported_program().module() - quantized_activations = [ - n - for n in gm.graph.nodes - if n.op == "call_function" - and n.target == exir_ops.edge.cortex_m.quantized_activation.default - ] - # An LSTMCell has 3 sigmoid gates (i, f, o) + 1 tanh gate (g) + 1 output - # tanh = 5 activation calls; all should lower once the decompose pass - # makes them visible to the quantizer. 
- assert len(quantized_activations) == 5, ( - f"expected 5 quantized_activation nodes (3 sigmoid gates + 2 tanh), " - f"got {len(quantized_activations)}" - ) diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py index 24e0294cf85..6ae82e1e70c 100644 --- a/backends/cortex_m/test/ops/test_activation_quant.py +++ b/backends/cortex_m/test/ops/test_activation_quant.py @@ -1,4 +1,4 @@ -# Copyright 2026 Arm Limited and/or its affiliates. +# Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.