Summary
When the process locale is set to German (German_Germany.1252), DirectML produces all zeros for a minimal fp16 Gelu model (ONNX opset 20). Under US English locale the same model produces correct results. The fp32 variant works correctly under both locales.
Environment
- OS: Windows
- GPU: [fill in your GPU]
- Driver: [fill in your driver version]
- DirectML.dll version: [from onnxruntime-directml package]
- onnxruntime-directml: 1.21.0
Reproduction
Minimal script and ONNX model attached below. The model contains a single Gelu node (opset 20, approximate="none") with fp16 input/output.
python gelu_locale_repro.py compare --model gelu-fp16 --provider DmlExecutionProvider
Observed output (fp16 + DML + German locale):
{
"bitwise_equal": false,
"max_abs_diff": 10.0,
"max_rel_diff": 1.0,
"max_diff_flat_index": 256,
"us_value_at_max": 10.0,
"de_value_at_max": 0.0
}
Control cases (all pass):
fp32 + DML: bitwise equal under both locales ✓
fp16 + CPU: bitwise equal under both locales ✓
Analysis
ONNX Runtime's DML EP code has no locale-sensitive float formatting — all operator descriptors pass binary float values via DML_TENSOR_DESC structs. The opset-20 Gelu is decomposed into Erf/Mul/Add/Div primitives (since DML only registers contrib-domain Gelu), all of which DML claims for fp16 natively without ORT inserting casts.
The bug appears to be inside DirectML.dll's fp16 shader compilation or initialization path, where a locale-dependent float-to-string conversion (e.g., 0.5 → 0,5) may produce invalid HLSL or broken constants.
import numpy as np
import onnx
from onnx import helper, TensorProto
# Build the minimal model: one Gelu node mapping fp16 input "x" to fp16
# output "y", both with the fixed shape [1, 257].
X, Y = (
    helper.make_tensor_value_info(tensor_name, TensorProto.FLOAT16, [1, 257])
    for tensor_name in ("x", "y")
)
node = helper.make_node("Gelu", inputs=["x"], outputs=["y"], approximate="none")
graph = helper.make_graph(nodes=[node], name="gelu_fp16", inputs=[X], outputs=[Y])
# Default-domain opset 20 is requested explicitly for the Gelu node.
model = helper.make_model(
    graph,
    opset_imports=[helper.make_opsetid("", 20)],
)
onnx.save(model, "gelu_opset20_float16.onnx")
For the fp32 control model, replace FLOAT16 with FLOAT and save the result as gelu_opset20_float32.onnx (the file name the repro script looks for).
Repro:
from __future__ import annotations
import argparse
import json
import locale
import subprocess
import sys
from pathlib import Path
import numpy as np
import onnx
# Directory that holds this script; the model files are looked up next to it.
SCRIPT_DIR = Path(__file__).resolve().parent

FP16_MODEL = SCRIPT_DIR.joinpath("gelu_opset20_float16.onnx")
FP32_MODEL = SCRIPT_DIR.joinpath("gelu_opset20_float32.onnx")
TPN_MODEL = SCRIPT_DIR.joinpath("tpn_unet_64x64.onnx")

# Short names accepted by --model, mapped to the model file they stand for.
MODEL_ALIASES = {
    "gelu-fp16": FP16_MODEL,
    "gelu-fp32": FP32_MODEL,
    "tpn": TPN_MODEL,
    "tpn-unet": TPN_MODEL,
}

# Locale names to try for each short locale key. The candidates are attempted
# in the listed order until one of them is accepted.
LOCALE_CANDIDATES = {
    "us": (
        "English_United States.1252",
        "English_United States",
        "en-US",
        "en_US.UTF-8",
        "C",
    ),
    "de": (
        "German_Germany.1252",
        "German_Germany",
        "de-DE",
        "de_DE.UTF-8",
    ),
}
def resolve_model_path(model: str | Path) -> Path:
    """Turn a model alias or a filesystem path into a resolved path.

    Args:
        model: Either a key of ``MODEL_ALIASES`` or a path to a model file.

    Returns:
        The aliased model path when ``model`` is a known alias, otherwise
        ``model`` itself interpreted as a path; resolved in both cases.
    """
    key = str(model)
    if key in MODEL_ALIASES:
        target = MODEL_ALIASES[key]
    else:
        target = Path(key)
    return target.resolve()
def make_gelu_input(dtype: np.dtype) -> np.ndarray:
    """Build the fixed 257-element GELU test vector as a ``(1, 257)`` array.

    The samples are generated in float32 and then cast to ``dtype``.

    Args:
        dtype: Element type of the returned array.

    Returns:
        A single-row array covering [-10, 10].
    """
    # Sparse tails plus a dense band around zero, so that differences between
    # GELU implementations are easy to spot.
    negative_tail = np.linspace(-10.0, -1.0, 64, dtype=np.float32)
    centre = np.linspace(-1.0, 1.0, 129, dtype=np.float32)
    positive_tail = np.linspace(1.0, 10.0, 64, dtype=np.float32)
    samples = np.concatenate((negative_tail, centre, positive_tail))
    row = samples.reshape(1, -1)
    return row.astype(dtype)
def dtype_from_ort_type(type_name: str) -> np.dtype:
    """Map a tensor type string such as ``"tensor(float)"`` to a NumPy dtype.

    Args:
        type_name: Type string as reported for a session input.

    Returns:
        The matching ``np.dtype``.

    Raises:
        ValueError: If ``type_name`` is not one of the handled type strings.
    """
    scalar_types = {
        "tensor(float16)": np.float16,
        "tensor(float)": np.float32,
        "tensor(double)": np.float64,
        "tensor(int64)": np.int64,
        "tensor(int32)": np.int32,
    }
    scalar_type = scalar_types.get(type_name)
    if scalar_type is None:
        raise ValueError(f"Unsupported input type: {type_name}")
    return np.dtype(scalar_type)
def concrete_shape(shape: list[object]) -> tuple[int, ...]:
    """Replace every non-concrete dimension of ``shape`` with 1.

    A dimension is kept only when it is an ``int`` greater than zero; any
    other entry (a symbolic name, ``None``, zero, a negative value) becomes 1.

    Args:
        shape: Dimensions as reported for a session input.

    Returns:
        A tuple with the same number of entries as ``shape``.
    """
    dims = []
    for dim in shape:
        if isinstance(dim, int) and dim > 0:
            dims.append(dim)
        else:
            dims.append(1)
    return tuple(dims)
def deterministic_input(name: str, shape: tuple[int, ...], dtype: np.dtype) -> np.ndarray:
    """Create a reproducible feed array for the model input called ``name``.

    Rules, checked in this order:

    * ``"x"`` gets the fixed GELU test vector from ``make_gelu_input``;
      ``shape`` is not consulted for this input.
    * Any integer ``dtype`` gets an all-zero array of ``shape``.
    * ``"timestep"`` gets an array of ``shape`` filled with 1.0.
    * ``"encoder_hidden_states"`` gets a ramp from -0.5 to 0.5.
    * Everything else (including ``"sample"``) gets a ramp from -1.0 to 1.0.

    Floating-point values are generated in float32 and then cast to ``dtype``.

    Args:
        name: Input name; selects the rule above.
        shape: Fully concrete shape of the array to produce.
        dtype: Element type of the returned array.

    Returns:
        The generated array.
    """
    if name == "x":
        return make_gelu_input(dtype)
    if np.issubdtype(dtype, np.integer):
        return np.zeros(shape, dtype=dtype)

    size = int(np.prod(shape))
    if name == "timestep":
        # Fill every element rather than building a fixed one-element array,
        # so the reshape below also works when shape has more than one element.
        values = np.ones(size, dtype=np.float32)
    elif name == "encoder_hidden_states":
        values = np.linspace(-0.5, 0.5, size, dtype=np.float32)
    else:
        values = np.linspace(-1.0, 1.0, size, dtype=np.float32)
    return values.reshape(shape).astype(dtype)
def create_feeds(model_path: Path, session_inputs: list[object]) -> dict[str, np.ndarray]:
    """Build the feed dictionary for a session from deterministic inputs.

    The model file is loaded (without external data) only to learn the names
    of its initializers; session inputs carrying one of those names are not
    fed.

    Args:
        model_path: Path of the ONNX model backing the session.
        session_inputs: Input descriptors exposing ``name``, ``type`` and
            ``shape`` attributes.

    Returns:
        A mapping from input name to the array to feed.
    """
    graph = onnx.load(model_path, load_external_data=False).graph
    skipped_names = {init.name for init in graph.initializer}

    feeds: dict[str, np.ndarray] = {}
    for entry in session_inputs:
        input_name = entry.name
        if input_name not in skipped_names:
            element_type = dtype_from_ort_type(entry.type)
            dims = concrete_shape(entry.shape)
            feeds[input_name] = deterministic_input(input_name, dims, element_type)
    return feeds
def set_requested_locale(locale_key: str) -> str:
    """Switch ``LC_ALL`` to the first usable candidate for ``locale_key``.

    Args:
        locale_key: Key into ``LOCALE_CANDIDATES``.

    Returns:
        The locale string returned by ``locale.setlocale`` for the candidate
        that was accepted.

    Raises:
        RuntimeError: If every candidate is rejected; the message lists each
            candidate together with its error.
    """
    failures: list[str] = []
    for name in LOCALE_CANDIDATES[locale_key]:
        try:
            applied = locale.setlocale(locale.LC_ALL, name)
        except locale.Error as exc:
            failures.append(f"{name}: {exc}")
        else:
            return applied
    message = f"Could not set {locale_key!r} locale. Tried: " + "; ".join(failures)
    raise RuntimeError(message)
def save_outputs(output_path: Path, outputs: list[np.ndarray]) -> None:
    """Write model outputs to exactly ``output_path``.

    A single output is stored in ``.npy`` format; any other number of outputs
    is stored as an ``.npz`` archive with keys ``output_0``, ``output_1``, ...
    Missing parent directories are created.

    Args:
        output_path: Destination file.
        outputs: Arrays to store.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Write through an open handle: given a path, np.save / np.savez append
    # ".npy" / ".npz" when the name lacks that suffix, so a multi-output
    # result requested as "x.npy" would end up in "x.npy.npz" and could not be
    # found again under output_path.
    with open(output_path, "wb") as handle:
        if len(outputs) == 1:
            np.save(handle, outputs[0])
        else:
            np.savez(handle, **{f"output_{i}": output for i, output in enumerate(outputs)})
def load_outputs(output_path: Path) -> list[np.ndarray]:
    """Load the arrays stored in an ``.npy`` or ``.npz`` file.

    Args:
        output_path: File to read.

    Returns:
        A one-element list for an ``.npy`` file. For an ``.npz`` archive, all
        stored arrays, with keys ending in ``_<number>`` ordered by that
        number and any remaining keys after them in alphabetical order.
    """

    def key_order(key: str) -> tuple[int, int, str]:
        # Order "output_2" before "output_10"; a plain string sort would not.
        suffix = key.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), key)
        return (1, 0, key)

    loaded = np.load(output_path)
    if isinstance(loaded, np.lib.npyio.NpzFile):
        # Read every array while the archive is open, then release the file.
        with loaded:
            return [loaded[key] for key in sorted(loaded.files, key=key_order)]
    return [loaded]
def run_model(model_path: str | Path, provider: str, locale_key: str, output_path: Path) -> None:
    """Run one model once under one locale and store outputs plus metadata.

    The outputs go to ``output_path``; a JSON description of the run is
    written next to it (same name, ``.json`` suffix) and also printed.

    Args:
        model_path: Model alias or path.
        provider: Name of the execution provider to use.
        locale_key: Key into ``LOCALE_CANDIDATES`` selecting the locale.
        output_path: Destination for the output arrays.

    Raises:
        RuntimeError: If the locale cannot be set, onnxruntime cannot be
            imported, or ``provider`` is not among the available providers.
    """
    resolved_model = resolve_model_path(model_path)
    # The locale is switched first, so onnxruntime is imported and the session
    # is created with the requested locale already active.
    locale_name = set_requested_locale(locale_key)

    try:
        import onnxruntime as ort
    except ImportError as exc:
        raise RuntimeError(
            "onnxruntime is not installed. For DirectML, install onnxruntime-directml."
        ) from exc

    known_providers = ort.get_available_providers()
    if provider not in known_providers:
        raise RuntimeError(
            f"Provider {provider!r} is not available. Available providers: {known_providers}"
        )

    options = ort.SessionOptions()
    options.enable_mem_pattern = False
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    session = ort.InferenceSession(
        str(resolved_model),
        sess_options=options,
        providers=[provider],
    )

    feeds = create_feeds(resolved_model, session.get_inputs())
    results = session.run(None, feeds)
    save_outputs(output_path, results)

    # Summarise dtype and shape of everything fed in and produced.
    input_summary = {}
    for feed_name, array in feeds.items():
        input_summary[feed_name] = {
            "dtype": str(array.dtype),
            "shape": list(array.shape),
        }
    output_summary = [
        {"dtype": str(array.dtype), "shape": list(array.shape)} for array in results
    ]

    metadata = {
        "active_locale": locale_name,
        "available_providers": known_providers,
        "session_providers": session.get_providers(),
        "provider": provider,
        "model": str(resolved_model),
        "inputs": input_summary,
        "outputs": output_summary,
        "output_path": str(output_path),
    }
    rendered = json.dumps(metadata, indent=2)
    output_path.with_suffix(".json").write_text(rendered)
    print(rendered)
def compare_outputs(us_path: Path, de_path: Path) -> None:
    """Compare two saved output files and print a JSON difference report.

    All outputs of each run are cast to float32 and flattened into one vector
    for the difference statistics. ``bitwise_equal`` is computed on the
    arrays as stored, before that cast.

    Args:
        us_path: Output file of the US-locale run.
        de_path: Output file of the German-locale run.

    Raises:
        ValueError: If the two files hold a different number of outputs or a
            different total number of elements.
    """
    us_outputs = load_outputs(us_path)
    de_outputs = load_outputs(de_path)
    if len(us_outputs) != len(de_outputs):
        raise ValueError(f"Output count mismatch: {len(us_outputs)} vs {len(de_outputs)}")

    us = np.concatenate([output.astype(np.float32).ravel() for output in us_outputs])
    de = np.concatenate([output.astype(np.float32).ravel() for output in de_outputs])
    if us.shape != de.shape:
        # Fail clearly instead of letting the subtraction broadcast or raise
        # an unrelated error.
        raise ValueError(f"Output size mismatch: {us.size} vs {de.size} elements")

    abs_diff = np.abs(us - de)
    max_index = int(abs_diff.argmax())
    max_abs = float(abs_diff[max_index])
    # Floor the denominator so zero reference values do not divide by zero.
    denom = np.maximum(np.abs(us), np.float32(1e-12))
    max_rel = float((abs_diff / denom).max())

    # Compare raw bytes: value equality would report NaN != NaN and
    # -0.0 == 0.0, which is not what "bitwise" promises.
    equal = all(
        us_output.dtype == de_output.dtype
        and us_output.shape == de_output.shape
        and us_output.tobytes() == de_output.tobytes()
        for us_output, de_output in zip(us_outputs, de_outputs)
    )

    report = {
        "bitwise_equal": equal,
        "max_abs_diff": max_abs,
        "max_rel_diff": max_rel,
        "max_diff_flat_index": max_index,
        "us_value_at_max": float(us[max_index]),
        "de_value_at_max": float(de[max_index]),
    }
    print(json.dumps(report, indent=2))
def compare_locales(model_path: Path, provider: str) -> None:
    """Run the model in two child processes, one per locale, and compare.

    Each child re-invokes this script's ``run`` sub-command, so the locale is
    applied in a fresh process. Results are written below
    ``SCRIPT_DIR / "outputs"`` and then passed to ``compare_outputs``.

    Args:
        model_path: Model alias or path.
        provider: Name of the execution provider to use.

    Raises:
        subprocess.CalledProcessError: If a child process exits with a
            non-zero status.
    """
    resolved_model = resolve_model_path(model_path)
    results_dir = SCRIPT_DIR / "outputs"
    results_dir.mkdir(parents=True, exist_ok=True)

    short_provider = provider.replace("ExecutionProvider", "").lower()
    targets = {
        key: results_dir / f"{resolved_model.stem}.{short_provider}.{key}.npy"
        for key in ("us", "de")
    }

    script = str(Path(__file__).resolve())
    for key, target in targets.items():
        command = [sys.executable, script, "run"]
        command += ["--model", str(resolved_model)]
        command += ["--provider", provider]
        command += ["--locale", key]
        command += ["--output", str(target)]
        print("running:", " ".join(command))
        subprocess.run(command, check=True)

    compare_outputs(targets["us"], targets["de"])
def parse_args() -> argparse.Namespace:
    """Parse the command line for the ``run`` and ``compare`` sub-commands.

    Returns:
        The parsed namespace; ``command`` holds the chosen sub-command.
    """
    model_help = "Model path or alias: gelu-fp16, gelu-fp32, tpn"
    default_provider = "DmlExecutionProvider"

    parser = argparse.ArgumentParser(
        description="Run ONNX models under US and German locales and compare outputs."
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # run: a single model under a single locale.
    run_cmd = commands.add_parser("run", help="Run one model under one locale.")
    run_cmd.add_argument("--model", required=True, help=model_help)
    run_cmd.add_argument("--provider", default=default_provider)
    run_cmd.add_argument("--locale", choices=sorted(LOCALE_CANDIDATES), required=True)
    run_cmd.add_argument("--output", type=Path, required=True)

    # compare: both locales in child processes, followed by a diff report.
    compare_cmd = commands.add_parser(
        "compare",
        help="Run US and German locale child processes and compare outputs.",
    )
    compare_cmd.add_argument("--model", default="gelu-fp16", help=model_help)
    compare_cmd.add_argument("--provider", default=default_provider)

    return parser.parse_args()
def main() -> None:
    """Dispatch to the sub-command selected on the command line."""
    args = parse_args()
    selected = args.command
    if selected == "run":
        run_model(args.model, args.provider, args.locale, args.output)
        return
    if selected == "compare":
        compare_locales(args.model, args.provider)
        return
    # Reached only if a sub-command is registered without being handled here.
    raise AssertionError(selected)


if __name__ == "__main__":
    main()
gelu_opset20_float32.zip
Summary
When the process locale is set to German (German_Germany.1252), DirectML produces all zeros for a minimal fp16 Gelu model (ONNX opset 20). Under US English locale the same model produces correct results. The fp32 variant works correctly under both locales.
Environment
Reproduction
Minimal script and ONNX model attached below. The model contains a single Gelu node (opset 20, approximate="none") with fp16 input/output.
Observed output (fp16 + DML + German locale):
Control cases (all pass):
fp32 + DML: bitwise equal under both locales ✓
fp16 + CPU: bitwise equal under both locales ✓
Analysis
ONNX Runtime's DML EP code has no locale-sensitive float formatting — all operator descriptors pass binary float values via DML_TENSOR_DESC structs. The opset-20 Gelu is decomposed into Erf/Mul/Add/Div primitives (since DML only registers contrib-domain Gelu), all of which DML claims for fp16 natively without ORT inserting casts.
The bug appears to be inside DirectML.dll's fp16 shader compilation or initialization path, where a locale-dependent float-to-string conversion (e.g., 0.5 → 0,5) may produce invalid HLSL or broken constants.
Replace FLOAT16 with FLOAT for the fp32 control model.
Repro:
gelu_opset20_float32.zip