Skip to content

DML_OPERATOR_ACTIVATION_GELU fp16 produces incorrect output under non-US locale (e.g. German) #736

@yuslepukhin

Description

@yuslepukhin

Summary

When the process locale is set to German (German_Germany.1252), DirectML produces all zeros for a minimal fp16 Gelu model (ONNX opset 20). Under US English locale the same model produces correct results. The fp32 variant works correctly under both locales.

Environment

  • OS: Windows
  • GPU: [fill in your GPU]
  • Driver: [fill in your driver version]
  • DirectML.dll version: [from onnxruntime-directml package]
  • onnxruntime-directml: 1.21.0

Reproduction

Minimal script and ONNX model attached below. The model contains a single Gelu node (opset 20, approximate="none") with fp16 input/output.

python gelu_locale_repro.py compare --model gelu-fp16 --provider DmlExecutionProvider

Observed comparison report (fp16 + DML, US locale output vs. German locale output):

{
  "bitwise_equal": false,
  "max_abs_diff": 10.0,
  "max_rel_diff": 1.0,
  "max_diff_flat_index": 256,
  "us_value_at_max": 10.0,
  "de_value_at_max": 0.0
}

Control cases (all pass):
fp32 + DML: bitwise equal under both locales ✓
fp16 + CPU: bitwise equal under both locales ✓

Analysis

ONNX Runtime's DML EP code has no locale-sensitive float formatting — all operator descriptors pass binary float values via DML_TENSOR_DESC structs. The opset-20 Gelu is decomposed into Erf/Mul/Add/Div primitives (since DML only registers contrib-domain Gelu), all of which DML claims for fp16 natively without ORT inserting casts.

The bug appears to be inside DirectML.dll's fp16 shader compilation or initialization path, where a locale-dependent float-to-string conversion (e.g., 0.5 → 0,5) may produce invalid HLSL or broken constants.

import numpy as np
import onnx
from onnx import helper, TensorProto

# Build a single-node model computing y = Gelu(x) on a fixed [1, 257] fp16 tensor.
tensor_shape = [1, 257]
gelu_input = helper.make_tensor_value_info("x", TensorProto.FLOAT16, tensor_shape)
gelu_output = helper.make_tensor_value_info("y", TensorProto.FLOAT16, tensor_shape)

# approximate="none" selects the exact (non-tanh) Gelu variant.
gelu_node = helper.make_node("Gelu", ["x"], ["y"], approximate="none")
gelu_graph = helper.make_graph([gelu_node], "gelu_fp16", [gelu_input], [gelu_output])

# Default ONNX domain ("") at opset 20.
default_opset = helper.make_opsetid("", 20)
gelu_model = helper.make_model(gelu_graph, opset_imports=[default_opset])
onnx.save(gelu_model, "gelu_opset20_float16.onnx")

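For the fp32 control model, replace FLOAT16 with FLOAT in both tensor declarations and save the result as gelu_opset20_float32.onnx (the file name the repro script below expects).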

Repro script (gelu_locale_repro.py):

from __future__ import annotations

import argparse
import json
import locale
import subprocess
import sys
from pathlib import Path

import numpy as np
import onnx


# Directory that contains this script; model files are looked up next to it.
SCRIPT_DIR = Path(__file__).resolve().parent

# Model files, addressed relative to the script directory.
FP16_MODEL = SCRIPT_DIR.joinpath("gelu_opset20_float16.onnx")
FP32_MODEL = SCRIPT_DIR.joinpath("gelu_opset20_float32.onnx")
TPN_MODEL = SCRIPT_DIR.joinpath("tpn_unet_64x64.onnx")

# Short names accepted in place of a model path; "tpn" and "tpn-unet" map to the same file.
MODEL_ALIASES = {
    "gelu-fp16": FP16_MODEL,
    "gelu-fp32": FP32_MODEL,
    "tpn": TPN_MODEL,
    "tpn-unet": TPN_MODEL,
}

# Locale names to try, in order, for each locale key.
LOCALE_CANDIDATES = {
    "us": ("English_United States.1252", "English_United States", "en-US", "en_US.UTF-8", "C"),
    "de": ("German_Germany.1252", "German_Germany", "de-DE", "de_DE.UTF-8"),
}


def resolve_model_path(model: str | Path) -> Path:
    """Turn a model alias or filesystem path into a resolved ``Path``.

    Args:
        model: Either a key of ``MODEL_ALIASES`` or a path to a model file.

    Returns:
        The aliased model file if ``model`` is a known alias, otherwise the
        given path; in both cases passed through ``Path.resolve()``.
    """
    key = str(model)
    target = MODEL_ALIASES[key] if key in MODEL_ALIASES else Path(key)
    return target.resolve()


def make_gelu_input(dtype: np.dtype) -> np.ndarray:
    """Build the fixed ``(1, 257)`` test vector fed to the Gelu models.

    The values are generated in float32 and then cast to ``dtype``.

    Args:
        dtype: Element type of the returned array.

    Returns:
        A ``(1, 257)`` array covering [-10, 10], with denser sampling in [-1, 1].
    """
    # (start, stop, count) per segment: negative tail, region around zero, positive tail.
    # Values around zero and in the tails make GELU implementation differences easy to see.
    segments = (
        (-10.0, -1.0, 64),
        (-1.0, 1.0, 129),
        (1.0, 10.0, 64),
    )
    pieces = [
        np.linspace(start, stop, count, dtype=np.float32)
        for start, stop, count in segments
    ]
    row = np.concatenate(pieces)
    return row.reshape(1, -1).astype(dtype)


def dtype_from_ort_type(type_name: str) -> np.dtype:
    """Translate an ONNX Runtime tensor type string into a NumPy dtype.

    Args:
        type_name: A type string such as ``"tensor(float16)"``.

    Returns:
        The matching ``np.dtype``.

    Raises:
        ValueError: If ``type_name`` is not one of the supported tensor types.
    """
    supported = (
        ("tensor(float16)", np.float16),
        ("tensor(float)", np.float32),
        ("tensor(double)", np.float64),
        ("tensor(int64)", np.int64),
        ("tensor(int32)", np.int32),
    )
    for ort_name, scalar_type in supported:
        if type_name == ort_name:
            return np.dtype(scalar_type)
    raise ValueError(f"Unsupported input type: {type_name}")


def concrete_shape(shape: list[object]) -> tuple[int, ...]:
    """Replace every non-concrete dimension of ``shape`` with 1.

    Args:
        shape: Dimensions as reported for a session input; entries may be
            positive ints or anything else (names, ``None``, non-positive ints).

    Returns:
        A tuple in which positive ``int`` entries are kept and every other
        entry is replaced by 1.
    """
    resolved = []
    for dim in shape:
        if not isinstance(dim, int) or dim <= 0:
            resolved.append(1)
        else:
            resolved.append(dim)
    return tuple(resolved)


def deterministic_input(name: str, shape: tuple[int, ...], dtype: np.dtype) -> np.ndarray:
    """Create a reproducible input tensor chosen by input name.

    Args:
        name: Input name. ``"x"``, ``"timestep"`` and
            ``"encoder_hidden_states"`` get dedicated value patterns; every
            other name gets a linear ramp over [-1, 1].
        shape: Concrete shape of the tensor to create. Ignored for ``"x"``,
            which always uses the fixed shape of ``make_gelu_input``.
        dtype: Element type of the returned array.

    Returns:
        The generated array, cast to ``dtype``.
    """
    if name == "x":
        return make_gelu_input(dtype)

    # Integer inputs (other than "x") are always all zeros, whatever their name.
    if np.issubdtype(dtype, np.integer):
        return np.zeros(shape, dtype=dtype)

    size = int(np.prod(shape))
    if name == "timestep":
        # Fill every element with 1.0 so the reshape below also works when the
        # timestep input has more than one element.
        values = np.full(size, 1.0, dtype=np.float32)
    elif name == "encoder_hidden_states":
        values = np.linspace(-0.5, 0.5, size, dtype=np.float32)
    else:
        # "sample" and any unrecognised float input share the same [-1, 1] ramp.
        values = np.linspace(-1.0, 1.0, size, dtype=np.float32)

    return values.reshape(shape).astype(dtype)


def create_feeds(model_path: Path, session_inputs: list[object]) -> dict[str, np.ndarray]:
    """Build the feed dictionary for a session from deterministic test data.

    Inputs whose name matches a graph initializer in the model file are left
    out of the feeds.

    Args:
        model_path: Path of the ONNX model; loaded without external data to
            read its initializer names.
        session_inputs: Input descriptors exposing ``name``, ``type`` and
            ``shape`` attributes.

    Returns:
        A mapping from input name to generated array.
    """
    graph = onnx.load(model_path, load_external_data=False).graph
    baked_in = {init.name for init in graph.initializer}

    feeds: dict[str, np.ndarray] = {}
    for entry in session_inputs:
        input_name = entry.name
        if input_name not in baked_in:
            np_dtype = dtype_from_ort_type(entry.type)
            input_shape = concrete_shape(entry.shape)
            feeds[input_name] = deterministic_input(input_name, input_shape, np_dtype)
    return feeds


def set_requested_locale(locale_key: str) -> str:
    """Switch ``LC_ALL`` to the first usable locale registered for ``locale_key``.

    Args:
        locale_key: Key into ``LOCALE_CANDIDATES``.

    Returns:
        The locale string returned by ``locale.setlocale`` for the first
        candidate that was accepted.

    Raises:
        RuntimeError: If none of the candidate names could be set; the message
            lists each candidate together with its error.
    """
    failures: list[str] = []
    for name in LOCALE_CANDIDATES[locale_key]:
        try:
            applied = locale.setlocale(locale.LC_ALL, name)
        except locale.Error as exc:
            failures.append(f"{name}: {exc}")
        else:
            return applied

    tried = "; ".join(failures)
    raise RuntimeError(f"Could not set {locale_key!r} locale. Tried: {tried}")


def save_outputs(output_path: Path, outputs: list[np.ndarray]) -> None:
    """Write model outputs to exactly ``output_path``.

    A single output is stored in ``.npy`` format; several outputs are stored
    as an ``.npz`` archive with keys ``output_0``, ``output_1``, ...

    Args:
        output_path: Destination file; missing parent directories are created.
        outputs: Arrays to store.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Write through an open file handle: when given a path, np.save / np.savez
    # append ".npy" / ".npz" if the name lacks that extension, so the data would
    # land in a different file than the one later read back from output_path.
    with open(output_path, "wb") as handle:
        if len(outputs) == 1:
            np.save(handle, outputs[0])
        else:
            np.savez(handle, **{f"output_{i}": output for i, output in enumerate(outputs)})


def _output_key_order(key: str) -> tuple[str, int]:
    """Sort key that orders ``<prefix>_<number>`` archive keys numerically."""
    prefix, _, suffix = key.rpartition("_")
    if suffix.isdecimal():
        return (prefix, int(suffix))
    return (key, -1)


def load_outputs(output_path: Path) -> list[np.ndarray]:
    """Load the arrays stored in an ``.npy`` or ``.npz`` file.

    Args:
        output_path: File to read.

    Returns:
        A one-element list for a plain array file. For an archive, all member
        arrays ordered by key, with numeric suffixes compared as numbers so
        that ``output_2`` comes before ``output_10``.
    """
    loaded = np.load(output_path)
    if isinstance(loaded, np.lib.npyio.NpzFile):
        # Read every member eagerly, then close the underlying archive file.
        with loaded:
            return [loaded[key] for key in sorted(loaded.files, key=_output_key_order)]
    return [loaded]


def run_model(model_path: str | Path, provider: str, locale_key: str, output_path: Path) -> None:
    """Run one model once under one locale and store outputs plus metadata.

    The outputs are written to ``output_path``; a JSON description of the run
    is written next to it (same name with a ``.json`` suffix) and printed.

    Args:
        model_path: Model path or alias.
        provider: Name of the ONNX Runtime execution provider to use.
        locale_key: Key into ``LOCALE_CANDIDATES`` selecting the process locale.
        output_path: File that receives the model outputs.

    Raises:
        RuntimeError: If the locale cannot be set, onnxruntime cannot be
            imported, or ``provider`` is not among the available providers.
    """
    resolved_model = resolve_model_path(model_path)
    # The locale is switched before onnxruntime is imported.
    # NOTE(review): presumably so native libraries initialise under the
    # requested locale - confirm before reordering.
    active_locale = set_requested_locale(locale_key)

    try:
        import onnxruntime as ort
    except ImportError as exc:
        message = "onnxruntime is not installed. For DirectML, install onnxruntime-directml."
        raise RuntimeError(message) from exc

    available = ort.get_available_providers()
    if provider not in available:
        raise RuntimeError(
            f"Provider {provider!r} is not available. Available providers: {available}"
        )

    options = ort.SessionOptions()
    options.enable_mem_pattern = False
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

    session = ort.InferenceSession(
        str(resolved_model),
        sess_options=options,
        providers=[provider],
    )
    feeds = create_feeds(resolved_model, session.get_inputs())
    outputs = session.run(None, feeds)
    save_outputs(output_path, outputs)

    # Summarise dtype and shape of everything that went in and came out.
    input_summary = {}
    for input_name, array in feeds.items():
        input_summary[input_name] = {
            "dtype": str(array.dtype),
            "shape": list(array.shape),
        }
    output_summary = [
        {"dtype": str(array.dtype), "shape": list(array.shape)}
        for array in outputs
    ]

    metadata = {
        "active_locale": active_locale,
        "available_providers": available,
        "session_providers": session.get_providers(),
        "provider": provider,
        "model": str(resolved_model),
        "inputs": input_summary,
        "outputs": output_summary,
        "output_path": str(output_path),
    }
    rendered = json.dumps(metadata, indent=2)
    output_path.with_suffix(".json").write_text(rendered)
    print(rendered)


def compare_outputs(us_path: Path, de_path: Path) -> None:
    """Compare two saved output files and print a JSON difference report.

    All outputs are converted to float32 and flattened into one vector per
    file for the numeric statistics. ``bitwise_equal`` is computed on the
    stored arrays themselves and requires identical dtype, shape and bytes.

    Args:
        us_path: Outputs produced under the US locale.
        de_path: Outputs produced under the German locale.

    Raises:
        ValueError: If the two files hold a different number of outputs or a
            different total number of elements.
    """
    us_outputs = load_outputs(us_path)
    de_outputs = load_outputs(de_path)
    if len(us_outputs) != len(de_outputs):
        raise ValueError(f"Output count mismatch: {len(us_outputs)} vs {len(de_outputs)}")

    us = np.concatenate([output.astype(np.float32).ravel() for output in us_outputs])
    de = np.concatenate([output.astype(np.float32).ravel() for output in de_outputs])
    # Refuse to broadcast: differing element counts mean the runs are not comparable.
    if us.size != de.size:
        raise ValueError(f"Output element count mismatch: {us.size} vs {de.size}")

    abs_diff = np.abs(us - de)
    max_index = int(abs_diff.argmax())
    max_abs = float(abs_diff[max_index])
    # Floor the denominator so zero reference values do not divide by zero.
    denom = np.maximum(np.abs(us), np.float32(1e-12))
    max_rel = float((abs_diff / denom).max())
    # Compare raw bytes: value equality would treat -0.0 and +0.0 as equal and
    # identical NaN payloads as different, which is not a bitwise comparison.
    equal = all(
        us_output.dtype == de_output.dtype
        and us_output.shape == de_output.shape
        and us_output.tobytes() == de_output.tobytes()
        for us_output, de_output in zip(us_outputs, de_outputs)
    )

    report = {
        "bitwise_equal": equal,
        "max_abs_diff": max_abs,
        "max_rel_diff": max_rel,
        "max_diff_flat_index": max_index,
        "us_value_at_max": float(us[max_index]),
        "de_value_at_max": float(de[max_index]),
    }
    print(json.dumps(report, indent=2))


def compare_locales(model_path: Path, provider: str) -> None:
    """Run the model in one child process per locale and compare the results.

    Each child re-invokes this script with the ``run`` sub-command and writes
    its outputs under ``SCRIPT_DIR / "outputs"``; the two result files are then
    passed to ``compare_outputs``.

    Args:
        model_path: Model path or alias.
        provider: Name of the ONNX Runtime execution provider to use.

    Raises:
        subprocess.CalledProcessError: If a child process exits with a
            non-zero status.
    """
    resolved_model = resolve_model_path(model_path)
    output_dir = SCRIPT_DIR / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    provider_name = provider.replace("ExecutionProvider", "").lower()
    result_paths = {
        key: output_dir / f"{resolved_model.stem}.{provider_name}.{key}.npy"
        for key in ("us", "de")
    }
    script = str(Path(__file__).resolve())

    # A fresh interpreter per locale keeps the two runs isolated from each other.
    for locale_key, output_path in result_paths.items():
        command = [sys.executable, script, "run"]
        command += ["--model", str(resolved_model)]
        command += ["--provider", provider]
        command += ["--locale", locale_key]
        command += ["--output", str(output_path)]
        print("running:", " ".join(command))
        subprocess.run(command, check=True)

    compare_outputs(result_paths["us"], result_paths["de"])


def parse_args() -> argparse.Namespace:
    """Parse the command line for the ``run`` and ``compare`` sub-commands.

    Returns:
        The parsed namespace; ``command`` holds the chosen sub-command name.
    """
    model_help = "Model path or alias: gelu-fp16, gelu-fp32, tpn"
    default_provider = "DmlExecutionProvider"

    root = argparse.ArgumentParser(
        description="Run ONNX models under US and German locales and compare outputs."
    )
    commands = root.add_subparsers(dest="command", required=True)

    # "run": a single model execution under one explicitly chosen locale.
    run_cmd = commands.add_parser("run", help="Run one model under one locale.")
    run_cmd.add_argument("--model", required=True, help=model_help)
    run_cmd.add_argument("--provider", default=default_provider)
    run_cmd.add_argument("--locale", choices=sorted(LOCALE_CANDIDATES), required=True)
    run_cmd.add_argument("--output", type=Path, required=True)

    # "compare": runs both locales in child processes and reports the difference.
    compare_cmd = commands.add_parser(
        "compare", help="Run US and German locale child processes and compare outputs."
    )
    compare_cmd.add_argument("--model", default="gelu-fp16", help=model_help)
    compare_cmd.add_argument("--provider", default=default_provider)

    return root.parse_args()


def main() -> None:
    """Dispatch the parsed command line to the matching sub-command handler.

    Raises:
        AssertionError: If the parsed command is neither ``run`` nor ``compare``.
    """
    args = parse_args()
    command = args.command

    if command == "run":
        run_model(args.model, args.provider, args.locale, args.output)
        return
    if command == "compare":
        compare_locales(args.model, args.provider)
        return

    raise AssertionError(command)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

gelu_opset20_float32.zip

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions