pytorch · JulianCloudNTH · Jun 2, 2026
diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
+install_swiftshader() {
+  _https_amazon_aws=https://ossci-android.s3.amazonaws.com
+  _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
+  _swiftshader_dir=/tmp/swiftshader
+  mkdir -p $_swiftshader_dir
+
+  _tmp_archive="/tmp/${_swiftshader_archive}"
+
+  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
+    --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"
+
+  tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
+
+  export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
+  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/"
+  export ETVK_USING_SWIFTSHADER=1
+}
+
+install_swiftshader
+bash backends/webgpu/scripts/setup-wgpu-native.sh
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
@@ -57,6 +57,14 @@ if [[ "$FLOW" == *vulkan* ]]; then
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *webgpu* ]]; then
+    # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
+    # which are required to build and run the WebGPU delegate.
+    source .ci/scripts/setup-webgpu-linux-deps.sh
+
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
+fi
+
 if [[ "$FLOW" == *arm* ]]; then
     if [[ "$SUITE" == "operators" ]]; then
         PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1)

diff --git a/.github/workflows/test-backend-webgpu.yml b/.github/workflows/test-backend-webgpu.yml
@@ -0,0 +1,27 @@
+name: Test WebGPU Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-webgpu:
+    uses: ./.github/workflows/_test_backend.yml
+    with:
+      backend: webgpu
+      flows: '["webgpu"]'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      run-linux: true
@@ -1056,6 +1056,10 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs vulkan_backend)
   endif()
 
+  if(EXECUTORCH_BUILD_WEBGPU)
+    list(APPEND _dep_libs webgpu_backend)
+  endif()
+
   # compile options for pybind
   set(_pybind_compile_options
       $<$<CXX_COMPILER_ID:MSVC>:/EHsc

diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
@@ -117,6 +117,12 @@ def _load_vulkan() -> list[TestFlow]:
     return [VULKAN_TEST_FLOW, VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW]
 
 
+def _load_webgpu() -> list[TestFlow]:
+    from executorch.backends.test.suite.flows.webgpu import WEBGPU_TEST_FLOW
+
+    return [WEBGPU_TEST_FLOW]
+
+
 def _load_openvino() -> list[TestFlow]:
     from executorch.backends.test.suite.flows.openvino import (
         OPENVINO_INT8_TEST_FLOW,
@@ -178,6 +184,7 @@ def all_flows() -> dict[str, TestFlow]:
         + _register_flow(_load_xnnpack, "XNNPACK")
         + _register_flow(_load_coreml, "Core ML")
         + _register_flow(_load_vulkan, "Vulkan")
+        + _register_flow(_load_webgpu, "WebGPU")
         + _register_flow(_load_openvino, "OpenVINO")
         + _register_flow(_load_qnn, "QNN")
         + _register_flow(_load_arm, "ARM")

diff --git a/backends/test/suite/flows/webgpu.py b/backends/test/suite/flows/webgpu.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.test.suite.flow import TestFlow
+from executorch.backends.webgpu.test.tester import WebGPUTester
+
+
+def _create_webgpu_flow() -> TestFlow:
+    return TestFlow(
+        "webgpu",
+        backend="webgpu",
+        tester_factory=WebGPUTester,
+        skip_patterns=["float16", "float64"],  # Not supported in swiftshader
+    )
+
+
+WEBGPU_TEST_FLOW = _create_webgpu_flow()
@@ -21,14 +21,18 @@ if(NOT TARGET vulkan_schema)
   # target), but vulkan_schema is unconditionally defined.
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
-    ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
+    ${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema EXCLUDE_FROM_ALL
   )
 endif()
 
 set(WEBGPU_SRCS
-    runtime/WebGPUBackend.cpp runtime/WebGPUGraph.cpp
-    runtime/WebGPUDelegateHeader.cpp runtime/WebGPUDevice.cpp
-    runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp
+    runtime/WebGPUBackend.cpp
+    runtime/WebGPUGraph.cpp
+    runtime/WebGPUDelegateHeader.cpp
+    runtime/WebGPUDevice.cpp
+    runtime/ops/OperatorRegistry.cpp
+    runtime/ops/add/BinaryOp.cpp
+    runtime/ops/rms_norm/RmsNorm.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})

diff --git a/backends/webgpu/__init__.py b/backends/webgpu/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/runtime/WebGPUBackend.cpp b/backends/webgpu/runtime/WebGPUBackend.cpp
@@ -76,7 +76,7 @@ Result<DelegateHandle*> WebGPUBackend::init(
   }
 
   try {
-    graph->build(flatbuffer_data, constant_data);
+    graph->build(flatbuffer_data, constant_data, context.get_named_data_map());
   } catch (const std::exception& e) {
     ET_LOG(Error, "WebGPU graph build failed: %s", e.what());
     graph->~WebGPUGraph();

diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -121,7 +121,13 @@ WebGPUContext create_webgpu_context() {
   device_cb.callback = on_device_request;
   device_cb.userdata1 = &device_result;
 
+  // Request the adapter's full limits; software adapters default many to 0.
+  WGPULimits supported_limits = {};
   WGPUDeviceDescriptor device_desc = {};
+  if (wgpuAdapterGetLimits(ctx.adapter, &supported_limits) ==
+      WGPUStatus_Success) {
+    device_desc.requiredLimits = &supported_limits;
+  }
   device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
 
   wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -10,6 +10,7 @@
 #include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/serialization/schema_generated.h>
+#include <executorch/runtime/core/named_data_map.h>
 
 #include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
 #include <webgpu/wgpu.h>
@@ -93,7 +94,8 @@ WebGPUGraph::~WebGPUGraph() {
 
 void WebGPUGraph::build(
     const void* flatbuffer_data,
-    const uint8_t* constant_data) {
+    const uint8_t* constant_data,
+    const executorch::runtime::NamedDataMap* named_data_map) {
   if (!device_) {
     auto* ctx = get_default_webgpu_context();
     if (ctx) {
@@ -165,6 +167,25 @@ void WebGPUGraph::build(
                 const uint8_t* src = constant_data + vk_bytes->offset();
                 wgpuQueueWriteBuffer(
                     queue_, tensor.buffer, 0, src, tensor.nbytes);
+              } else if (
+                  vk_bytes->named_key() != nullptr &&
+                  named_data_map != nullptr) {
+                // Constant stored in the PTE named-data map.
+                auto buf =
+                    named_data_map->get_data(vk_bytes->named_key()->c_str());
+                if (buf.ok() && buf->size() >= tensor.nbytes) {
+                  wgpuQueueWriteBuffer(
+                      queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
+                  buf->Free();
+                } else {
+                  throw std::runtime_error(
+                      std::string("WebGPU: named constant '") +
+                      vk_bytes->named_key()->c_str() +
+                      "' missing or undersized in NamedDataMap");
+                }
+              } else {
+                throw std::runtime_error(
+                    "WebGPU: constant has no inline offset and no named-data key");
               }
             }
           }

diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -15,10 +15,14 @@
 #include <unordered_map>
 #include <vector>
 
+#include <executorch/runtime/core/named_data_map.h>
+
 namespace executorch {
 namespace backends {
 namespace webgpu {
 
+using executorch::runtime::NamedDataMap;
+
 struct WebGPUTensor {
   WGPUBuffer buffer = nullptr;
   std::vector<int64_t> dims;
@@ -66,7 +70,10 @@ class WebGPUGraph {
 
   // Build the graph from a deserialized VkGraph flatbuffer and constant data.
   // The flatbuffer_data pointer must remain valid during build().
-  void build(const void* flatbuffer_data, const uint8_t* constant_data);
+  void build(
+      const void* flatbuffer_data,
+      const uint8_t* constant_data,
+      const NamedDataMap* named_data_map = nullptr);
 
   // Copy input tensor data from host pointers into GPU buffers.
   void copy_inputs(const std::vector<std::pair<const void*, size_t>>& inputs);

diff --git a/backends/webgpu/runtime/ops/add/binary_add.wgsl b/backends/webgpu/runtime/ops/add/binary_add.wgsl
@@ -8,7 +8,7 @@ struct Params {
 }
 @group(0) @binding(3) var<uniform> params: Params;
 
-@compute @workgroup_size(256)
+@compute @workgroup_size(64)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let idx = gid.x;
     if (idx >= params.num_elements) {

diff --git a/backends/webgpu/runtime/ops/add/binary_add_wgsl.h b/backends/webgpu/runtime/ops/add/binary_add_wgsl.h
@@ -24,7 +24,7 @@ struct Params {
 }
 @group(0) @binding(3) var<uniform> params: Params;
 
-@compute @workgroup_size(256)
+@compute @workgroup_size(64)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let idx = gid.x;
     if (idx >= params.num_elements) {
@@ -34,7 +34,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 }
 )";
 
-inline constexpr uint32_t kBinaryAddWorkgroupSize = 256;
+inline constexpr uint32_t kBinaryAddWorkgroupSize = 64;
 
 } // namespace webgpu
 } // namespace backends