Narendasan/test dx#4376
Open
narendasan wants to merge 2 commits into
Open
Conversation
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-06-30 21:52:03.788259+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest: python -m tests.ci {list,show,run,matrix,doctor}
- list all suites, tiers, lanes, variants
- show <name> a suite's resolved command per variant
- run <name> [opts] [-- ...] run one suite (the call CI + just both make)
- matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
- doctor validate the manifest (CI lints this)
+list all suites, tiers, lanes, variants
+show <name> a suite's resolved command per variant
+run <name> [opts] [-- ...] run one suite (the call CI + just both make)
+matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
+doctor validate the manifest (CI lints this)
"""
from __future__ import annotations
import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name
def _cmd_list(_: argparse.Namespace) -> int:
width = max(len(s.name) for s in SUITES)
- print(f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS")
+ print(
+ f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS"
+ )
for s in SUITES:
- print(f"{s.name.ljust(width)} {s.tier:<4} "
- f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}")
- print(f"\n{len(SUITES)} suites. "
- f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)")
+ print(
+ f"{s.name.ljust(width)} {s.tier:<4} "
+ f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}"
+ )
+ print(
+ f"\n{len(SUITES)} suites. "
+ f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)"
+ )
return 0
def _cmd_show(args: argparse.Namespace) -> int:
s = by_name(args.name)
@@ -39,36 +45,41 @@
def _cmd_run(args: argparse.Namespace) -> int:
s = by_name(args.name)
variants = [args.variant] if args.variant else list(s.variants)
if args.variant and args.variant not in s.variants:
- print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
- f"it runs on {s.variants}", file=sys.stderr)
+ print(
+ f"::warning::{s.name} does not run on variant {args.variant!r}; "
+ f"it runs on {s.variants}",
+ file=sys.stderr,
+ )
return 0
rc = 0
for var in variants:
rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
return rc
def _cmd_run_lane(args: argparse.Namespace) -> int:
"""Run every suite in a lane/tier, continuing past failures (so one consolidated
report sees them all). Returns non-zero if any suite failed."""
- jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ jobs = select(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not jobs:
print("::warning::no suites match the given filters", file=sys.stderr)
return 0
rc = 0
for s, var in jobs:
rc = run_suite(s, var, dry_run=args.dry_run) or rc
return rc
def _cmd_matrix(args: argparse.Namespace) -> int:
- include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ include = matrix(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not include:
print("::warning::matrix is empty for the given filters", file=sys.stderr)
print(json.dumps({"include": include}))
return 0
@@ -109,12 +120,14 @@
if problems:
for p in problems:
print(f"✗ {p}", file=sys.stderr)
print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
return 1
- print(f"✓ manifest OK — {len(SUITES)} suites, "
- f"{len(set(junits))} unique junit paths, no collisions.")
+ print(
+ f"✓ manifest OK — {len(SUITES)} suites, "
+ f"{len(set(junits))} unique junit paths, no collisions."
+ )
return 0
def main(argv: list[str] | None = None) -> int:
p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
sp.set_defaults(fn=_cmd_show)
sp = sub.add_parser("run", help="run one suite")
sp.add_argument("name")
sp.add_argument("--variant", choices=("standard", "rtx"))
- sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
- sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
- "(use `-- -k foo`)")
+ sp.add_argument(
+ "--dry-run", action="store_true", help="print the command, don't run"
+ )
+ sp.add_argument(
+ "pytest_args",
+ nargs="*",
+ help="extra args forwarded to pytest " "(use `-- -k foo`)",
+ )
sp.set_defaults(fn=_cmd_run)
- sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+ sp = sub.add_parser(
+ "run-lane", help="run every suite in a lane/tier, past failures"
+ )
g = sp.add_mutually_exclusive_group()
g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
g.add_argument("--tier", choices=("l0", "l1", "l2"))
sp.add_argument("--variant", choices=("standard", "rtx"))
sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-06-30 21:52:03.844831+00:00
@@ -22,13 +22,18 @@
)
# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
- "--reruns", "1", "--reruns-delay", "5",
- "--only-rerun", "cudaErrorStreamCaptureInvalidated",
- "--only-rerun", "Stream capture invalidated",
+ "--reruns",
+ "1",
+ "--reruns-delay",
+ "5",
+ "--only-rerun",
+ "cudaErrorStreamCaptureInvalidated",
+ "--only-rerun",
+ "Stream capture invalidated",
]
def _launcher() -> list[str]:
"""The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
"""(argv, cwd) pairs for a named setup step."""
launcher = _launcher()
if step == "hub":
return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
if step == "executorch":
- return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
- REPO_ROOT)]
+ return [
+ (
+ launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+ REPO_ROOT,
+ )
+ ]
if step == "cuda-core":
- return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
- REPO_ROOT)]
+ return [
+ (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+ ]
if step == "mpi":
- return [(["dnf", "install", "-y", "mpich", "mpich-devel",
- "openmpi", "openmpi-devel"], REPO_ROOT)]
+ return [
+ (
+ [
+ "dnf",
+ "install",
+ "-y",
+ "mpich",
+ "mpich-devel",
+ "openmpi",
+ "openmpi-devel",
+ ],
+ REPO_ROOT,
+ )
+ ]
raise KeyError(f"unknown setup step {step!r} in a suite definition")
def describe(suite: Suite, variant: Variant) -> str:
"""The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
process exit code (non-zero on first failure), mirroring the bash tiers."""
v = suite.for_variant(variant)
extra = extra or []
env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
cwd = REPO_ROOT / v["cwd"]
- pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ pytest_cmd = (
+ _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ )
if dry_run:
print(describe(suite, variant))
if extra:
print(f" # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
print(f"::warning::setup step {step!r} exited {rc}", flush=True)
print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
if rc != 0:
- repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
- + build_pytest_args(suite, variant) + extra)
- print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
- flush=True)
+ repro = shlex.join(
+ ["uv", "run", "--no-sync", "pytest"]
+ + build_pytest_args(suite, variant)
+ + extra
+ )
+ print(
+ f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+ flush=True,
+ )
return rc
for f in v["follow"]:
fcmd = _launcher() + list(f)
print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@
def matrix(**filters: str | None) -> list[dict[str, str]]:
"""GitHub-Actions matrix ``include`` entries for the selected jobs."""
return [
- {"suite": s.name, "variant": var, "tier": s.tier,
- "cwd": s.for_variant(var)["cwd"]}
+ {
+ "suite": s.name,
+ "variant": var,
+ "tier": s.tier,
+ "cwd": s.for_variant(var)["cwd"],
+ }
for s, var in select(**filters)
]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-06-30 21:52:03.892690+00:00
@@ -54,172 +54,286 @@
"""
name: str
tier: Tier
lanes: tuple[Lane, ...]
- cwd: str = "tests/py/dynamo" # relative to repo root
- paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
- markers: str | None = None # -m EXPR
- keyword: str | None = None # -k EXPR
- dist: str | None = None # --dist=loadscope
- maxfail: int | None = None # --maxfail=N
- ir: str | None = None # --ir torch_compile
- jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
- reruns: bool = True # wrap in the flake-rerun helper
- verbose: bool = False # -v
+ cwd: str = "tests/py/dynamo" # relative to repo root
+ paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
+ markers: str | None = None # -m EXPR
+ keyword: str | None = None # -k EXPR
+ dist: str | None = None # --dist=loadscope
+ maxfail: int | None = None # --maxfail=N
+ ir: str | None = None # --ir torch_compile
+ jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
+ reruns: bool = True # wrap in the flake-rerun helper
+ verbose: bool = False # -v
variants: tuple[Variant, ...] = ALL_VARIANTS
- platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
- setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
- follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
+ platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
+ setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
+ follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
env: dict[str, str] = field(default_factory=dict)
overrides: dict[str, dict[str, Any]] = field(default_factory=dict) # per-variant
def for_variant(self, variant: Variant) -> dict[str, Any]:
"""This suite's effective fields for ``variant`` (applies overrides)."""
- base = {f: getattr(self, f) for f in (
- "cwd", "paths", "markers", "keyword", "dist", "maxfail",
- "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
- )}
+ base = {
+ f: getattr(self, f)
+ for f in (
+ "cwd",
+ "paths",
+ "markers",
+ "keyword",
+ "dist",
+ "maxfail",
+ "ir",
+ "jobs",
+ "reruns",
+ "verbose",
+ "setup",
+ "follow",
+ "env",
+ )
+ }
base.update(self.overrides.get(variant, {}))
return base
# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
Suite(
- "dynamo-converters", tier="l0", lanes=("fast", "full"),
- paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+ "dynamo-converters",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("conversion/",),
+ dist="--dist=loadscope",
+ maxfail=20,
+ jobs="8",
# RTX does not shard converters with loadscope.
overrides={"rtx": {"dist": None}},
),
Suite(
- "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
- paths=("runtime/test_000_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
- paths=("partitioning/test_000_*",), jobs="8",
+ "dynamo-runtime-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("runtime/test_000_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("partitioning/test_000_*",),
+ jobs="8",
# RTX runs the whole partitioning suite (no smoke subset split).
overrides={"rtx": {"paths": ("partitioning/",)}},
),
Suite(
- "dynamo-lowering", tier="l0", lanes=("fast", "full"),
- paths=("lowering/",), jobs="8",
- ),
- Suite(
- "py-core", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/core", paths=(".",), jobs="8",
- ),
- Suite(
- "ts-api", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+ "dynamo-lowering",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("lowering/",),
+ jobs="8",
+ ),
+ Suite(
+ "py-core",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/core",
+ paths=(".",),
+ jobs="8",
+ ),
+ Suite(
+ "ts-api",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/ts",
+ paths=("api/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
Suite(
- "dynamo-runtime", tier="l1", lanes=("full",),
- paths=("runtime/test_001_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning", tier="l1", lanes=("full",),
- paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+ "dynamo-runtime",
+ tier="l1",
+ lanes=("full",),
+ paths=("runtime/test_001_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning",
+ tier="l1",
+ lanes=("full",),
+ paths=("partitioning/test_001_*",),
+ jobs="8",
+ variants=("standard",),
),
Suite(
# Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
- "dynamo-hlo", tier="l1", lanes=("full",),
- paths=("hlo/",), jobs="8",
- ),
- Suite(
- "dynamo-models-critical", tier="l1", lanes=("full",),
- paths=("models/",), markers="critical",
- ),
- Suite(
- "torch-compile-backend", tier="l1", lanes=("full",),
+ "dynamo-hlo",
+ tier="l1",
+ lanes=("full",),
+ paths=("hlo/",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-models-critical",
+ tier="l1",
+ lanes=("full",),
+ paths=("models/",),
+ markers="critical",
+ ),
+ Suite(
+ "torch-compile-backend",
+ tier="l1",
+ lanes=("full",),
paths=("backend/",),
),
Suite(
- "torch-compile-models-critical", tier="l1", lanes=("full",),
+ "torch-compile-models-critical",
+ tier="l1",
+ lanes=("full",),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="critical", ir="torch_compile",
- ),
- Suite(
- "ts-models", tier="l1", lanes=("full",),
- cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+ markers="critical",
+ ir="torch_compile",
+ ),
+ Suite(
+ "ts-models",
+ tier="l1",
+ lanes=("full",),
+ cwd="tests/py/ts",
+ paths=("models/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
Suite(
- "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+ "torch-compile-models",
+ tier="l2",
+ lanes=("full", "nightly"),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="not critical", ir="torch_compile", jobs="auto",
- ),
- Suite(
- "dynamo-models", tier="l2", lanes=("full", "nightly"),
- paths=("models/",), markers="not critical", jobs="auto",
- ),
- Suite(
- "dynamo-llm", tier="l2", lanes=("nightly",),
- paths=("llm/",), jobs="auto",
- ),
- Suite(
- "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
- paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
- ),
- Suite(
- "executorch", tier="l2", lanes=("nightly",),
- paths=("executorch/",), setup=("executorch",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
+ markers="not critical",
+ ir="torch_compile",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-models",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("models/",),
+ markers="not critical",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-llm",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("llm/",),
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-runtime-full",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("runtime/",),
+ keyword="not test_000_ and not test_001_",
+ jobs="auto",
+ ),
+ Suite(
+ "executorch",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("executorch/",),
+ setup=("executorch",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
),
Suite(
# Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
# (The redundant conversion/ re-run from the old l2_plugin is dropped.)
- "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+ "plugins-automatic",
+ tier="l2",
+ lanes=("nightly",),
+ jobs="auto",
paths=(
"automatic_plugin/test_automatic_plugin.py",
"automatic_plugin/test_automatic_plugin_with_attrs.py",
"automatic_plugin/test_flashinfer_rmsnorm.py",
),
overrides={"rtx": {"paths": ("automatic_plugin/",)}},
),
Suite(
- "kernels", tier="l2", lanes=("nightly",),
- cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
- ),
- Suite(
- "ts-integrations", tier="l2", lanes=("nightly",),
- cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
- variants=("standard",),
- ),
- Suite(
- "distributed", tier="l2", lanes=("nightly",),
+ "kernels",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/kernels",
+ paths=(".",),
+ setup=("cuda-core",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ ),
+ Suite(
+ "ts-integrations",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/ts",
+ paths=("integrations/",),
+ setup=("hub",),
+ jobs="auto",
+ variants=("standard",),
+ ),
+ Suite(
+ "distributed",
+ tier="l2",
+ lanes=("nightly",),
paths=(
"distributed/test_nccl_ops.py",
"distributed/test_native_nccl.py",
"distributed/test_export_save_load.py",
),
- jobs="auto", verbose=True, reruns=False, variants=("standard",),
- platforms=("linux-x86_64",), setup=("mpi",),
+ jobs="auto",
+ verbose=True,
+ reruns=False,
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ setup=("mpi",),
env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
follow=(
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_native_nccl.py", "--multirank"),
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_export_save_load.py", "--multirank"),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_native_nccl.py",
+ "--multirank",
+ ),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_export_save_load.py",
+ "--multirank",
+ ),
),
),
]
# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
Suite(
- "python-only-runtime", tier="l1", lanes=("python-only",),
- paths=("runtime/",), jobs="8", variants=("standard",),
+ "python-only-runtime",
+ tier="l1",
+ lanes=("python-only",),
+ paths=("runtime/",),
+ jobs="8",
+ variants=("standard",),
),
]
SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)
1e92f97 to
b58b80a
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-01 18:50:01.703469+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest: python -m tests.ci {list,show,run,matrix,doctor}
- list all suites, tiers, lanes, variants
- show <name> a suite's resolved command per variant
- run <name> [opts] [-- ...] run one suite (the call CI + just both make)
- matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
- doctor validate the manifest (CI lints this)
+list all suites, tiers, lanes, variants
+show <name> a suite's resolved command per variant
+run <name> [opts] [-- ...] run one suite (the call CI + just both make)
+matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
+doctor validate the manifest (CI lints this)
"""
from __future__ import annotations
import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name
def _cmd_list(_: argparse.Namespace) -> int:
width = max(len(s.name) for s in SUITES)
- print(f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS")
+ print(
+ f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS"
+ )
for s in SUITES:
- print(f"{s.name.ljust(width)} {s.tier:<4} "
- f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}")
- print(f"\n{len(SUITES)} suites. "
- f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)")
+ print(
+ f"{s.name.ljust(width)} {s.tier:<4} "
+ f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}"
+ )
+ print(
+ f"\n{len(SUITES)} suites. "
+ f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)"
+ )
return 0
def _cmd_show(args: argparse.Namespace) -> int:
s = by_name(args.name)
@@ -39,36 +45,41 @@
def _cmd_run(args: argparse.Namespace) -> int:
s = by_name(args.name)
variants = [args.variant] if args.variant else list(s.variants)
if args.variant and args.variant not in s.variants:
- print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
- f"it runs on {s.variants}", file=sys.stderr)
+ print(
+ f"::warning::{s.name} does not run on variant {args.variant!r}; "
+ f"it runs on {s.variants}",
+ file=sys.stderr,
+ )
return 0
rc = 0
for var in variants:
rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
return rc
def _cmd_run_lane(args: argparse.Namespace) -> int:
"""Run every suite in a lane/tier, continuing past failures (so one consolidated
report sees them all). Returns non-zero if any suite failed."""
- jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ jobs = select(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not jobs:
print("::warning::no suites match the given filters", file=sys.stderr)
return 0
rc = 0
for s, var in jobs:
rc = run_suite(s, var, dry_run=args.dry_run) or rc
return rc
def _cmd_matrix(args: argparse.Namespace) -> int:
- include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ include = matrix(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not include:
print("::warning::matrix is empty for the given filters", file=sys.stderr)
print(json.dumps({"include": include}))
return 0
@@ -109,12 +120,14 @@
if problems:
for p in problems:
print(f"✗ {p}", file=sys.stderr)
print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
return 1
- print(f"✓ manifest OK — {len(SUITES)} suites, "
- f"{len(set(junits))} unique junit paths, no collisions.")
+ print(
+ f"✓ manifest OK — {len(SUITES)} suites, "
+ f"{len(set(junits))} unique junit paths, no collisions."
+ )
return 0
def main(argv: list[str] | None = None) -> int:
p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
sp.set_defaults(fn=_cmd_show)
sp = sub.add_parser("run", help="run one suite")
sp.add_argument("name")
sp.add_argument("--variant", choices=("standard", "rtx"))
- sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
- sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
- "(use `-- -k foo`)")
+ sp.add_argument(
+ "--dry-run", action="store_true", help="print the command, don't run"
+ )
+ sp.add_argument(
+ "pytest_args",
+ nargs="*",
+ help="extra args forwarded to pytest " "(use `-- -k foo`)",
+ )
sp.set_defaults(fn=_cmd_run)
- sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+ sp = sub.add_parser(
+ "run-lane", help="run every suite in a lane/tier, past failures"
+ )
g = sp.add_mutually_exclusive_group()
g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
g.add_argument("--tier", choices=("l0", "l1", "l2"))
sp.add_argument("--variant", choices=("standard", "rtx"))
sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-01 18:50:01.775392+00:00
@@ -22,13 +22,18 @@
)
# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
- "--reruns", "1", "--reruns-delay", "5",
- "--only-rerun", "cudaErrorStreamCaptureInvalidated",
- "--only-rerun", "Stream capture invalidated",
+ "--reruns",
+ "1",
+ "--reruns-delay",
+ "5",
+ "--only-rerun",
+ "cudaErrorStreamCaptureInvalidated",
+ "--only-rerun",
+ "Stream capture invalidated",
]
def _launcher() -> list[str]:
"""The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
"""(argv, cwd) pairs for a named setup step."""
launcher = _launcher()
if step == "hub":
return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
if step == "executorch":
- return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
- REPO_ROOT)]
+ return [
+ (
+ launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+ REPO_ROOT,
+ )
+ ]
if step == "cuda-core":
- return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
- REPO_ROOT)]
+ return [
+ (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+ ]
if step == "mpi":
- return [(["dnf", "install", "-y", "mpich", "mpich-devel",
- "openmpi", "openmpi-devel"], REPO_ROOT)]
+ return [
+ (
+ [
+ "dnf",
+ "install",
+ "-y",
+ "mpich",
+ "mpich-devel",
+ "openmpi",
+ "openmpi-devel",
+ ],
+ REPO_ROOT,
+ )
+ ]
raise KeyError(f"unknown setup step {step!r} in a suite definition")
def describe(suite: Suite, variant: Variant) -> str:
"""The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
process exit code (non-zero on first failure), mirroring the bash tiers."""
v = suite.for_variant(variant)
extra = extra or []
env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
cwd = REPO_ROOT / v["cwd"]
- pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ pytest_cmd = (
+ _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ )
if dry_run:
print(describe(suite, variant))
if extra:
print(f" # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
print(f"::warning::setup step {step!r} exited {rc}", flush=True)
print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
if rc != 0:
- repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
- + build_pytest_args(suite, variant) + extra)
- print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
- flush=True)
+ repro = shlex.join(
+ ["uv", "run", "--no-sync", "pytest"]
+ + build_pytest_args(suite, variant)
+ + extra
+ )
+ print(
+ f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+ flush=True,
+ )
return rc
for f in v["follow"]:
fcmd = _launcher() + list(f)
print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@
def matrix(**filters: str | None) -> list[dict[str, str]]:
"""GitHub-Actions matrix ``include`` entries for the selected jobs."""
return [
- {"suite": s.name, "variant": var, "tier": s.tier,
- "cwd": s.for_variant(var)["cwd"]}
+ {
+ "suite": s.name,
+ "variant": var,
+ "tier": s.tier,
+ "cwd": s.for_variant(var)["cwd"],
+ }
for s, var in select(**filters)
]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-01 18:50:01.849014+00:00
@@ -54,172 +54,286 @@
"""
name: str
tier: Tier
lanes: tuple[Lane, ...]
- cwd: str = "tests/py/dynamo" # relative to repo root
- paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
- markers: str | None = None # -m EXPR
- keyword: str | None = None # -k EXPR
- dist: str | None = None # --dist=loadscope
- maxfail: int | None = None # --maxfail=N
- ir: str | None = None # --ir torch_compile
- jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
- reruns: bool = True # wrap in the flake-rerun helper
- verbose: bool = False # -v
+ cwd: str = "tests/py/dynamo" # relative to repo root
+ paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
+ markers: str | None = None # -m EXPR
+ keyword: str | None = None # -k EXPR
+ dist: str | None = None # --dist=loadscope
+ maxfail: int | None = None # --maxfail=N
+ ir: str | None = None # --ir torch_compile
+ jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
+ reruns: bool = True # wrap in the flake-rerun helper
+ verbose: bool = False # -v
variants: tuple[Variant, ...] = ALL_VARIANTS
- platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
- setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
- follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
+ platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
+ setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
+ follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
env: dict[str, str] = field(default_factory=dict)
overrides: dict[str, dict[str, Any]] = field(default_factory=dict) # per-variant
def for_variant(self, variant: Variant) -> dict[str, Any]:
"""This suite's effective fields for ``variant`` (applies overrides)."""
- base = {f: getattr(self, f) for f in (
- "cwd", "paths", "markers", "keyword", "dist", "maxfail",
- "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
- )}
+ base = {
+ f: getattr(self, f)
+ for f in (
+ "cwd",
+ "paths",
+ "markers",
+ "keyword",
+ "dist",
+ "maxfail",
+ "ir",
+ "jobs",
+ "reruns",
+ "verbose",
+ "setup",
+ "follow",
+ "env",
+ )
+ }
base.update(self.overrides.get(variant, {}))
return base
# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
Suite(
- "dynamo-converters", tier="l0", lanes=("fast", "full"),
- paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+ "dynamo-converters",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("conversion/",),
+ dist="--dist=loadscope",
+ maxfail=20,
+ jobs="8",
# RTX does not shard converters with loadscope.
overrides={"rtx": {"dist": None}},
),
Suite(
- "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
- paths=("runtime/test_000_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
- paths=("partitioning/test_000_*",), jobs="8",
+ "dynamo-runtime-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("runtime/test_000_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("partitioning/test_000_*",),
+ jobs="8",
# RTX runs the whole partitioning suite (no smoke subset split).
overrides={"rtx": {"paths": ("partitioning/",)}},
),
Suite(
- "dynamo-lowering", tier="l0", lanes=("fast", "full"),
- paths=("lowering/",), jobs="8",
- ),
- Suite(
- "py-core", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/core", paths=(".",), jobs="8",
- ),
- Suite(
- "ts-api", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+ "dynamo-lowering",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("lowering/",),
+ jobs="8",
+ ),
+ Suite(
+ "py-core",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/core",
+ paths=(".",),
+ jobs="8",
+ ),
+ Suite(
+ "ts-api",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/ts",
+ paths=("api/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
Suite(
- "dynamo-runtime", tier="l1", lanes=("full",),
- paths=("runtime/test_001_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning", tier="l1", lanes=("full",),
- paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+ "dynamo-runtime",
+ tier="l1",
+ lanes=("full",),
+ paths=("runtime/test_001_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning",
+ tier="l1",
+ lanes=("full",),
+ paths=("partitioning/test_001_*",),
+ jobs="8",
+ variants=("standard",),
),
Suite(
# Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
- "dynamo-hlo", tier="l1", lanes=("full",),
- paths=("hlo/",), jobs="8",
- ),
- Suite(
- "dynamo-models-critical", tier="l1", lanes=("full",),
- paths=("models/",), markers="critical",
- ),
- Suite(
- "torch-compile-backend", tier="l1", lanes=("full",),
+ "dynamo-hlo",
+ tier="l1",
+ lanes=("full",),
+ paths=("hlo/",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-models-critical",
+ tier="l1",
+ lanes=("full",),
+ paths=("models/",),
+ markers="critical",
+ ),
+ Suite(
+ "torch-compile-backend",
+ tier="l1",
+ lanes=("full",),
paths=("backend/",),
),
Suite(
- "torch-compile-models-critical", tier="l1", lanes=("full",),
+ "torch-compile-models-critical",
+ tier="l1",
+ lanes=("full",),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="critical", ir="torch_compile",
- ),
- Suite(
- "ts-models", tier="l1", lanes=("full",),
- cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+ markers="critical",
+ ir="torch_compile",
+ ),
+ Suite(
+ "ts-models",
+ tier="l1",
+ lanes=("full",),
+ cwd="tests/py/ts",
+ paths=("models/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
Suite(
- "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+ "torch-compile-models",
+ tier="l2",
+ lanes=("full", "nightly"),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="not critical", ir="torch_compile", jobs="auto",
- ),
- Suite(
- "dynamo-models", tier="l2", lanes=("full", "nightly"),
- paths=("models/",), markers="not critical", jobs="auto",
- ),
- Suite(
- "dynamo-llm", tier="l2", lanes=("nightly",),
- paths=("llm/",), jobs="auto",
- ),
- Suite(
- "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
- paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
- ),
- Suite(
- "executorch", tier="l2", lanes=("nightly",),
- paths=("executorch/",), setup=("executorch",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
+ markers="not critical",
+ ir="torch_compile",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-models",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("models/",),
+ markers="not critical",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-llm",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("llm/",),
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-runtime-full",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("runtime/",),
+ keyword="not test_000_ and not test_001_",
+ jobs="auto",
+ ),
+ Suite(
+ "executorch",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("executorch/",),
+ setup=("executorch",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
),
Suite(
# Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
# (The redundant conversion/ re-run from the old l2_plugin is dropped.)
- "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+ "plugins-automatic",
+ tier="l2",
+ lanes=("nightly",),
+ jobs="auto",
paths=(
"automatic_plugin/test_automatic_plugin.py",
"automatic_plugin/test_automatic_plugin_with_attrs.py",
"automatic_plugin/test_flashinfer_rmsnorm.py",
),
overrides={"rtx": {"paths": ("automatic_plugin/",)}},
),
Suite(
- "kernels", tier="l2", lanes=("nightly",),
- cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
- ),
- Suite(
- "ts-integrations", tier="l2", lanes=("nightly",),
- cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
- variants=("standard",),
- ),
- Suite(
- "distributed", tier="l2", lanes=("nightly",),
+ "kernels",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/kernels",
+ paths=(".",),
+ setup=("cuda-core",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ ),
+ Suite(
+ "ts-integrations",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/ts",
+ paths=("integrations/",),
+ setup=("hub",),
+ jobs="auto",
+ variants=("standard",),
+ ),
+ Suite(
+ "distributed",
+ tier="l2",
+ lanes=("nightly",),
paths=(
"distributed/test_nccl_ops.py",
"distributed/test_native_nccl.py",
"distributed/test_export_save_load.py",
),
- jobs="auto", verbose=True, reruns=False, variants=("standard",),
- platforms=("linux-x86_64",), setup=("mpi",),
+ jobs="auto",
+ verbose=True,
+ reruns=False,
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ setup=("mpi",),
env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
follow=(
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_native_nccl.py", "--multirank"),
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_export_save_load.py", "--multirank"),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_native_nccl.py",
+ "--multirank",
+ ),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_export_save_load.py",
+ "--multirank",
+ ),
),
),
]
# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
Suite(
- "python-only-runtime", tier="l1", lanes=("python-only",),
- paths=("runtime/",), jobs="8", variants=("standard",),
+ "python-only-runtime",
+ tier="l1",
+ lanes=("python-only",),
+ paths=("runtime/",),
+ jobs="8",
+ variants=("standard",),
),
]
SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)
b58b80a to
8e99636
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-01 19:10:36.520231+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest: python -m tests.ci {list,show,run,matrix,doctor}
- list all suites, tiers, lanes, variants
- show <name> a suite's resolved command per variant
- run <name> [opts] [-- ...] run one suite (the call CI + just both make)
- matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
- doctor validate the manifest (CI lints this)
+list all suites, tiers, lanes, variants
+show <name> a suite's resolved command per variant
+run <name> [opts] [-- ...] run one suite (the call CI + just both make)
+matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
+doctor validate the manifest (CI lints this)
"""
from __future__ import annotations
import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name
def _cmd_list(_: argparse.Namespace) -> int:
width = max(len(s.name) for s in SUITES)
- print(f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS")
+ print(
+ f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS"
+ )
for s in SUITES:
- print(f"{s.name.ljust(width)} {s.tier:<4} "
- f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}")
- print(f"\n{len(SUITES)} suites. "
- f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)")
+ print(
+ f"{s.name.ljust(width)} {s.tier:<4} "
+ f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}"
+ )
+ print(
+ f"\n{len(SUITES)} suites. "
+ f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)"
+ )
return 0
def _cmd_show(args: argparse.Namespace) -> int:
s = by_name(args.name)
@@ -39,36 +45,41 @@
def _cmd_run(args: argparse.Namespace) -> int:
s = by_name(args.name)
variants = [args.variant] if args.variant else list(s.variants)
if args.variant and args.variant not in s.variants:
- print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
- f"it runs on {s.variants}", file=sys.stderr)
+ print(
+ f"::warning::{s.name} does not run on variant {args.variant!r}; "
+ f"it runs on {s.variants}",
+ file=sys.stderr,
+ )
return 0
rc = 0
for var in variants:
rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
return rc
def _cmd_run_lane(args: argparse.Namespace) -> int:
"""Run every suite in a lane/tier, continuing past failures (so one consolidated
report sees them all). Returns non-zero if any suite failed."""
- jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ jobs = select(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not jobs:
print("::warning::no suites match the given filters", file=sys.stderr)
return 0
rc = 0
for s, var in jobs:
rc = run_suite(s, var, dry_run=args.dry_run) or rc
return rc
def _cmd_matrix(args: argparse.Namespace) -> int:
- include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ include = matrix(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not include:
print("::warning::matrix is empty for the given filters", file=sys.stderr)
print(json.dumps({"include": include}))
return 0
@@ -109,12 +120,14 @@
if problems:
for p in problems:
print(f"✗ {p}", file=sys.stderr)
print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
return 1
- print(f"✓ manifest OK — {len(SUITES)} suites, "
- f"{len(set(junits))} unique junit paths, no collisions.")
+ print(
+ f"✓ manifest OK — {len(SUITES)} suites, "
+ f"{len(set(junits))} unique junit paths, no collisions."
+ )
return 0
def main(argv: list[str] | None = None) -> int:
p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
sp.set_defaults(fn=_cmd_show)
sp = sub.add_parser("run", help="run one suite")
sp.add_argument("name")
sp.add_argument("--variant", choices=("standard", "rtx"))
- sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
- sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
- "(use `-- -k foo`)")
+ sp.add_argument(
+ "--dry-run", action="store_true", help="print the command, don't run"
+ )
+ sp.add_argument(
+ "pytest_args",
+ nargs="*",
+ help="extra args forwarded to pytest " "(use `-- -k foo`)",
+ )
sp.set_defaults(fn=_cmd_run)
- sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+ sp = sub.add_parser(
+ "run-lane", help="run every suite in a lane/tier, past failures"
+ )
g = sp.add_mutually_exclusive_group()
g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
g.add_argument("--tier", choices=("l0", "l1", "l2"))
sp.add_argument("--variant", choices=("standard", "rtx"))
sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-01 19:10:36.590423+00:00
@@ -22,13 +22,18 @@
)
# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
- "--reruns", "1", "--reruns-delay", "5",
- "--only-rerun", "cudaErrorStreamCaptureInvalidated",
- "--only-rerun", "Stream capture invalidated",
+ "--reruns",
+ "1",
+ "--reruns-delay",
+ "5",
+ "--only-rerun",
+ "cudaErrorStreamCaptureInvalidated",
+ "--only-rerun",
+ "Stream capture invalidated",
]
def _launcher() -> list[str]:
"""The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
"""(argv, cwd) pairs for a named setup step."""
launcher = _launcher()
if step == "hub":
return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
if step == "executorch":
- return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
- REPO_ROOT)]
+ return [
+ (
+ launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+ REPO_ROOT,
+ )
+ ]
if step == "cuda-core":
- return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
- REPO_ROOT)]
+ return [
+ (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+ ]
if step == "mpi":
- return [(["dnf", "install", "-y", "mpich", "mpich-devel",
- "openmpi", "openmpi-devel"], REPO_ROOT)]
+ return [
+ (
+ [
+ "dnf",
+ "install",
+ "-y",
+ "mpich",
+ "mpich-devel",
+ "openmpi",
+ "openmpi-devel",
+ ],
+ REPO_ROOT,
+ )
+ ]
raise KeyError(f"unknown setup step {step!r} in a suite definition")
def describe(suite: Suite, variant: Variant) -> str:
"""The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
process exit code (non-zero on first failure), mirroring the bash tiers."""
v = suite.for_variant(variant)
extra = extra or []
env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
cwd = REPO_ROOT / v["cwd"]
- pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ pytest_cmd = (
+ _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ )
if dry_run:
print(describe(suite, variant))
if extra:
print(f" # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
print(f"::warning::setup step {step!r} exited {rc}", flush=True)
print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
if rc != 0:
- repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
- + build_pytest_args(suite, variant) + extra)
- print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
- flush=True)
+ repro = shlex.join(
+ ["uv", "run", "--no-sync", "pytest"]
+ + build_pytest_args(suite, variant)
+ + extra
+ )
+ print(
+ f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+ flush=True,
+ )
return rc
for f in v["follow"]:
fcmd = _launcher() + list(f)
print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@
def matrix(**filters: str | None) -> list[dict[str, str]]:
"""GitHub-Actions matrix ``include`` entries for the selected jobs."""
return [
- {"suite": s.name, "variant": var, "tier": s.tier,
- "cwd": s.for_variant(var)["cwd"]}
+ {
+ "suite": s.name,
+ "variant": var,
+ "tier": s.tier,
+ "cwd": s.for_variant(var)["cwd"],
+ }
for s, var in select(**filters)
]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-01 19:10:36.641696+00:00
@@ -54,172 +54,286 @@
"""
name: str
tier: Tier
lanes: tuple[Lane, ...]
- cwd: str = "tests/py/dynamo" # relative to repo root
- paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
- markers: str | None = None # -m EXPR
- keyword: str | None = None # -k EXPR
- dist: str | None = None # --dist=loadscope
- maxfail: int | None = None # --maxfail=N
- ir: str | None = None # --ir torch_compile
- jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
- reruns: bool = True # wrap in the flake-rerun helper
- verbose: bool = False # -v
+ cwd: str = "tests/py/dynamo" # relative to repo root
+ paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
+ markers: str | None = None # -m EXPR
+ keyword: str | None = None # -k EXPR
+ dist: str | None = None # --dist=loadscope
+ maxfail: int | None = None # --maxfail=N
+ ir: str | None = None # --ir torch_compile
+ jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
+ reruns: bool = True # wrap in the flake-rerun helper
+ verbose: bool = False # -v
variants: tuple[Variant, ...] = ALL_VARIANTS
- platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
- setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
- follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
+ platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
+ setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
+ follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
env: dict[str, str] = field(default_factory=dict)
overrides: dict[str, dict[str, Any]] = field(default_factory=dict) # per-variant
def for_variant(self, variant: Variant) -> dict[str, Any]:
"""This suite's effective fields for ``variant`` (applies overrides)."""
- base = {f: getattr(self, f) for f in (
- "cwd", "paths", "markers", "keyword", "dist", "maxfail",
- "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
- )}
+ base = {
+ f: getattr(self, f)
+ for f in (
+ "cwd",
+ "paths",
+ "markers",
+ "keyword",
+ "dist",
+ "maxfail",
+ "ir",
+ "jobs",
+ "reruns",
+ "verbose",
+ "setup",
+ "follow",
+ "env",
+ )
+ }
base.update(self.overrides.get(variant, {}))
return base
# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
Suite(
- "dynamo-converters", tier="l0", lanes=("fast", "full"),
- paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+ "dynamo-converters",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("conversion/",),
+ dist="--dist=loadscope",
+ maxfail=20,
+ jobs="8",
# RTX does not shard converters with loadscope.
overrides={"rtx": {"dist": None}},
),
Suite(
- "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
- paths=("runtime/test_000_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
- paths=("partitioning/test_000_*",), jobs="8",
+ "dynamo-runtime-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("runtime/test_000_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("partitioning/test_000_*",),
+ jobs="8",
# RTX runs the whole partitioning suite (no smoke subset split).
overrides={"rtx": {"paths": ("partitioning/",)}},
),
Suite(
- "dynamo-lowering", tier="l0", lanes=("fast", "full"),
- paths=("lowering/",), jobs="8",
- ),
- Suite(
- "py-core", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/core", paths=(".",), jobs="8",
- ),
- Suite(
- "ts-api", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+ "dynamo-lowering",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("lowering/",),
+ jobs="8",
+ ),
+ Suite(
+ "py-core",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/core",
+ paths=(".",),
+ jobs="8",
+ ),
+ Suite(
+ "ts-api",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/ts",
+ paths=("api/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
Suite(
- "dynamo-runtime", tier="l1", lanes=("full",),
- paths=("runtime/test_001_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning", tier="l1", lanes=("full",),
- paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+ "dynamo-runtime",
+ tier="l1",
+ lanes=("full",),
+ paths=("runtime/test_001_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning",
+ tier="l1",
+ lanes=("full",),
+ paths=("partitioning/test_001_*",),
+ jobs="8",
+ variants=("standard",),
),
Suite(
# Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
- "dynamo-hlo", tier="l1", lanes=("full",),
- paths=("hlo/",), jobs="8",
- ),
- Suite(
- "dynamo-models-critical", tier="l1", lanes=("full",),
- paths=("models/",), markers="critical",
- ),
- Suite(
- "torch-compile-backend", tier="l1", lanes=("full",),
+ "dynamo-hlo",
+ tier="l1",
+ lanes=("full",),
+ paths=("hlo/",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-models-critical",
+ tier="l1",
+ lanes=("full",),
+ paths=("models/",),
+ markers="critical",
+ ),
+ Suite(
+ "torch-compile-backend",
+ tier="l1",
+ lanes=("full",),
paths=("backend/",),
),
Suite(
- "torch-compile-models-critical", tier="l1", lanes=("full",),
+ "torch-compile-models-critical",
+ tier="l1",
+ lanes=("full",),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="critical", ir="torch_compile",
- ),
- Suite(
- "ts-models", tier="l1", lanes=("full",),
- cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+ markers="critical",
+ ir="torch_compile",
+ ),
+ Suite(
+ "ts-models",
+ tier="l1",
+ lanes=("full",),
+ cwd="tests/py/ts",
+ paths=("models/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
Suite(
- "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+ "torch-compile-models",
+ tier="l2",
+ lanes=("full", "nightly"),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="not critical", ir="torch_compile", jobs="auto",
- ),
- Suite(
- "dynamo-models", tier="l2", lanes=("full", "nightly"),
- paths=("models/",), markers="not critical", jobs="auto",
- ),
- Suite(
- "dynamo-llm", tier="l2", lanes=("nightly",),
- paths=("llm/",), jobs="auto",
- ),
- Suite(
- "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
- paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
- ),
- Suite(
- "executorch", tier="l2", lanes=("nightly",),
- paths=("executorch/",), setup=("executorch",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
+ markers="not critical",
+ ir="torch_compile",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-models",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("models/",),
+ markers="not critical",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-llm",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("llm/",),
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-runtime-full",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("runtime/",),
+ keyword="not test_000_ and not test_001_",
+ jobs="auto",
+ ),
+ Suite(
+ "executorch",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("executorch/",),
+ setup=("executorch",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
),
Suite(
# Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
# (The redundant conversion/ re-run from the old l2_plugin is dropped.)
- "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+ "plugins-automatic",
+ tier="l2",
+ lanes=("nightly",),
+ jobs="auto",
paths=(
"automatic_plugin/test_automatic_plugin.py",
"automatic_plugin/test_automatic_plugin_with_attrs.py",
"automatic_plugin/test_flashinfer_rmsnorm.py",
),
overrides={"rtx": {"paths": ("automatic_plugin/",)}},
),
Suite(
- "kernels", tier="l2", lanes=("nightly",),
- cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
- ),
- Suite(
- "ts-integrations", tier="l2", lanes=("nightly",),
- cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
- variants=("standard",),
- ),
- Suite(
- "distributed", tier="l2", lanes=("nightly",),
+ "kernels",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/kernels",
+ paths=(".",),
+ setup=("cuda-core",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ ),
+ Suite(
+ "ts-integrations",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/ts",
+ paths=("integrations/",),
+ setup=("hub",),
+ jobs="auto",
+ variants=("standard",),
+ ),
+ Suite(
+ "distributed",
+ tier="l2",
+ lanes=("nightly",),
paths=(
"distributed/test_nccl_ops.py",
"distributed/test_native_nccl.py",
"distributed/test_export_save_load.py",
),
- jobs="auto", verbose=True, reruns=False, variants=("standard",),
- platforms=("linux-x86_64",), setup=("mpi",),
+ jobs="auto",
+ verbose=True,
+ reruns=False,
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ setup=("mpi",),
env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
follow=(
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_native_nccl.py", "--multirank"),
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_export_save_load.py", "--multirank"),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_native_nccl.py",
+ "--multirank",
+ ),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_export_save_load.py",
+ "--multirank",
+ ),
),
),
]
# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
Suite(
- "python-only-runtime", tier="l1", lanes=("python-only",),
- paths=("runtime/",), jobs="8", variants=("standard",),
+ "python-only-runtime",
+ tier="l1",
+ lanes=("python-only",),
+ paths=("runtime/",),
+ jobs="8",
+ variants=("standard",),
),
]
SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)
8e99636 to
da667cd
Compare
…esign
Add tests/ci/{suites,runner,__main__}.py — a typed, declarative suite manifest
plus one runner (`python -m tests.ci`) that replaces the trt_tier_* bash
functions as the source of truth for what each test job runs. Faithfully
reproduces today's tier selection (verified via --dry-run) while fixing the
junit-path collisions and the double-run of hlo/, and giving every suite the
uniform flake-rerun+repro wrapper.
- pyproject: register smoke/flaky markers; norecursedirs += ci
- justfile: doctor / suites / suite / lane / report recipes (thin callers)
- TESTING_AND_CI_DESIGN.md: full local + CI design (manifest+runner, lanes
without merge queue, GHA-cache caching, aggregated agent-friendly reports)
da667cd to
58adfb6
Compare
ci(test-dx): grant id-token: write at ci.yml top level. The reusable build jobs (build_linux/build_windows) request 'id-token: write' for OIDC; a reusable workflow can't exceed its caller's permission ceiling, so ci.yml's top-level 'contents: read' made GitHub reject the workflow run (startup_failure: "nested job 'build' is requesting 'id-token: write', but is only allowed 'id-token: none'"). Grant id-token: write + contents: read, matching the original per-platform entry workflows.
58adfb6 to
4577a82
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py 2026-07-02 22:45:58.621201+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest: python -m tests.ci {list,show,run,matrix,doctor}
- list all suites, tiers, lanes, variants
- show <name> a suite's resolved command per variant
- run <name> [opts] [-- ...] run one suite (the call CI + just both make)
- matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
- doctor validate the manifest (CI lints this)
+list all suites, tiers, lanes, variants
+show <name> a suite's resolved command per variant
+run <name> [opts] [-- ...] run one suite (the call CI + just both make)
+matrix [--lane|--tier] JSON matrix `include` for GitHub Actions
+doctor validate the manifest (CI lints this)
"""
from __future__ import annotations
import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name
def _cmd_list(_: argparse.Namespace) -> int:
width = max(len(s.name) for s in SUITES)
- print(f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS")
+ print(
+ f"{'SUITE'.ljust(width)} TIER LANES VARIANTS PLATFORMS"
+ )
for s in SUITES:
- print(f"{s.name.ljust(width)} {s.tier:<4} "
- f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}")
- print(f"\n{len(SUITES)} suites. "
- f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)")
+ print(
+ f"{s.name.ljust(width)} {s.tier:<4} "
+ f"{','.join(s.lanes):<21} {','.join(s.variants):<15} {','.join(s.platforms)}"
+ )
+ print(
+ f"\n{len(SUITES)} suites. "
+ f"Run one: python -m tests.ci run <suite> (or `just suite <suite>`)"
+ )
return 0
def _cmd_show(args: argparse.Namespace) -> int:
s = by_name(args.name)
@@ -39,36 +45,41 @@
def _cmd_run(args: argparse.Namespace) -> int:
s = by_name(args.name)
variants = [args.variant] if args.variant else list(s.variants)
if args.variant and args.variant not in s.variants:
- print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
- f"it runs on {s.variants}", file=sys.stderr)
+ print(
+ f"::warning::{s.name} does not run on variant {args.variant!r}; "
+ f"it runs on {s.variants}",
+ file=sys.stderr,
+ )
return 0
rc = 0
for var in variants:
rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
return rc
def _cmd_run_lane(args: argparse.Namespace) -> int:
"""Run every suite in a lane/tier, continuing past failures (so one consolidated
report sees them all). Returns non-zero if any suite failed."""
- jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ jobs = select(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not jobs:
print("::warning::no suites match the given filters", file=sys.stderr)
return 0
rc = 0
for s, var in jobs:
rc = run_suite(s, var, dry_run=args.dry_run) or rc
return rc
def _cmd_matrix(args: argparse.Namespace) -> int:
- include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
- platform=args.platform)
+ include = matrix(
+ lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+ )
if not include:
print("::warning::matrix is empty for the given filters", file=sys.stderr)
print(json.dumps({"include": include}))
return 0
@@ -109,12 +120,14 @@
if problems:
for p in problems:
print(f"✗ {p}", file=sys.stderr)
print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
return 1
- print(f"✓ manifest OK — {len(SUITES)} suites, "
- f"{len(set(junits))} unique junit paths, no collisions.")
+ print(
+ f"✓ manifest OK — {len(SUITES)} suites, "
+ f"{len(set(junits))} unique junit paths, no collisions."
+ )
return 0
def main(argv: list[str] | None = None) -> int:
p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
sp.set_defaults(fn=_cmd_show)
sp = sub.add_parser("run", help="run one suite")
sp.add_argument("name")
sp.add_argument("--variant", choices=("standard", "rtx"))
- sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
- sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
- "(use `-- -k foo`)")
+ sp.add_argument(
+ "--dry-run", action="store_true", help="print the command, don't run"
+ )
+ sp.add_argument(
+ "pytest_args",
+ nargs="*",
+ help="extra args forwarded to pytest " "(use `-- -k foo`)",
+ )
sp.set_defaults(fn=_cmd_run)
- sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+ sp = sub.add_parser(
+ "run-lane", help="run every suite in a lane/tier, past failures"
+ )
g = sp.add_mutually_exclusive_group()
g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
g.add_argument("--tier", choices=("l0", "l1", "l2"))
sp.add_argument("--variant", choices=("standard", "rtx"))
sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py 2026-07-02 22:45:58.664304+00:00
@@ -22,13 +22,18 @@
)
# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
- "--reruns", "1", "--reruns-delay", "5",
- "--only-rerun", "cudaErrorStreamCaptureInvalidated",
- "--only-rerun", "Stream capture invalidated",
+ "--reruns",
+ "1",
+ "--reruns-delay",
+ "5",
+ "--only-rerun",
+ "cudaErrorStreamCaptureInvalidated",
+ "--only-rerun",
+ "Stream capture invalidated",
]
def _launcher() -> list[str]:
"""The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
"""(argv, cwd) pairs for a named setup step."""
launcher = _launcher()
if step == "hub":
return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
if step == "executorch":
- return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
- REPO_ROOT)]
+ return [
+ (
+ launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+ REPO_ROOT,
+ )
+ ]
if step == "cuda-core":
- return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
- REPO_ROOT)]
+ return [
+ (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+ ]
if step == "mpi":
- return [(["dnf", "install", "-y", "mpich", "mpich-devel",
- "openmpi", "openmpi-devel"], REPO_ROOT)]
+ return [
+ (
+ [
+ "dnf",
+ "install",
+ "-y",
+ "mpich",
+ "mpich-devel",
+ "openmpi",
+ "openmpi-devel",
+ ],
+ REPO_ROOT,
+ )
+ ]
raise KeyError(f"unknown setup step {step!r} in a suite definition")
def describe(suite: Suite, variant: Variant) -> str:
"""The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
process exit code (non-zero on first failure), mirroring the bash tiers."""
v = suite.for_variant(variant)
extra = extra or []
env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
cwd = REPO_ROOT / v["cwd"]
- pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ pytest_cmd = (
+ _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+ )
if dry_run:
print(describe(suite, variant))
if extra:
print(f" # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
print(f"::warning::setup step {step!r} exited {rc}", flush=True)
print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
if rc != 0:
- repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
- + build_pytest_args(suite, variant) + extra)
- print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
- flush=True)
+ repro = shlex.join(
+ ["uv", "run", "--no-sync", "pytest"]
+ + build_pytest_args(suite, variant)
+ + extra
+ )
+ print(
+ f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+ flush=True,
+ )
return rc
for f in v["follow"]:
fcmd = _launcher() + list(f)
print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@
def matrix(**filters: str | None) -> list[dict[str, str]]:
"""GitHub-Actions matrix ``include`` entries for the selected jobs."""
return [
- {"suite": s.name, "variant": var, "tier": s.tier,
- "cwd": s.for_variant(var)["cwd"]}
+ {
+ "suite": s.name,
+ "variant": var,
+ "tier": s.tier,
+ "cwd": s.for_variant(var)["cwd"],
+ }
for s, var in select(**filters)
]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py 2026-07-02 22:45:58.683233+00:00
@@ -54,172 +54,286 @@
"""
name: str
tier: Tier
lanes: tuple[Lane, ...]
- cwd: str = "tests/py/dynamo" # relative to repo root
- paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
- markers: str | None = None # -m EXPR
- keyword: str | None = None # -k EXPR
- dist: str | None = None # --dist=loadscope
- maxfail: int | None = None # --maxfail=N
- ir: str | None = None # --ir torch_compile
- jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
- reruns: bool = True # wrap in the flake-rerun helper
- verbose: bool = False # -v
+ cwd: str = "tests/py/dynamo" # relative to repo root
+ paths: tuple[str, ...] = () # pytest positionals (rel to cwd); globs ok
+ markers: str | None = None # -m EXPR
+ keyword: str | None = None # -k EXPR
+ dist: str | None = None # --dist=loadscope
+ maxfail: int | None = None # --maxfail=N
+ ir: str | None = None # --ir torch_compile
+ jobs: str | None = None # xdist default: None=serial, "8"/"auto"/"4"
+ reruns: bool = True # wrap in the flake-rerun helper
+ verbose: bool = False # -v
variants: tuple[Variant, ...] = ALL_VARIANTS
- platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
- setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
- follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
+ platforms: tuple[Platform, ...] = ALL_PLATFORMS # channels this suite runs on
+ setup: tuple[str, ...] = () # named pre-steps: hub|executorch|cuda-core|mpi
+ follow: tuple[tuple[str, ...], ...] = () # extra argv to run AFTER pytest
env: dict[str, str] = field(default_factory=dict)
overrides: dict[str, dict[str, Any]] = field(default_factory=dict) # per-variant
def for_variant(self, variant: Variant) -> dict[str, Any]:
"""This suite's effective fields for ``variant`` (applies overrides)."""
- base = {f: getattr(self, f) for f in (
- "cwd", "paths", "markers", "keyword", "dist", "maxfail",
- "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
- )}
+ base = {
+ f: getattr(self, f)
+ for f in (
+ "cwd",
+ "paths",
+ "markers",
+ "keyword",
+ "dist",
+ "maxfail",
+ "ir",
+ "jobs",
+ "reruns",
+ "verbose",
+ "setup",
+ "follow",
+ "env",
+ )
+ }
base.update(self.overrides.get(variant, {}))
return base
# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
Suite(
- "dynamo-converters", tier="l0", lanes=("fast", "full"),
- paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+ "dynamo-converters",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("conversion/",),
+ dist="--dist=loadscope",
+ maxfail=20,
+ jobs="8",
# RTX does not shard converters with loadscope.
overrides={"rtx": {"dist": None}},
),
Suite(
- "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
- paths=("runtime/test_000_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
- paths=("partitioning/test_000_*",), jobs="8",
+ "dynamo-runtime-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("runtime/test_000_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning-smoke",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("partitioning/test_000_*",),
+ jobs="8",
# RTX runs the whole partitioning suite (no smoke subset split).
overrides={"rtx": {"paths": ("partitioning/",)}},
),
Suite(
- "dynamo-lowering", tier="l0", lanes=("fast", "full"),
- paths=("lowering/",), jobs="8",
- ),
- Suite(
- "py-core", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/core", paths=(".",), jobs="8",
- ),
- Suite(
- "ts-api", tier="l0", lanes=("fast", "full"),
- cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+ "dynamo-lowering",
+ tier="l0",
+ lanes=("fast", "full"),
+ paths=("lowering/",),
+ jobs="8",
+ ),
+ Suite(
+ "py-core",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/core",
+ paths=(".",),
+ jobs="8",
+ ),
+ Suite(
+ "ts-api",
+ tier="l0",
+ lanes=("fast", "full"),
+ cwd="tests/py/ts",
+ paths=("api/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
Suite(
- "dynamo-runtime", tier="l1", lanes=("full",),
- paths=("runtime/test_001_*",), jobs="8",
- ),
- Suite(
- "dynamo-partitioning", tier="l1", lanes=("full",),
- paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+ "dynamo-runtime",
+ tier="l1",
+ lanes=("full",),
+ paths=("runtime/test_001_*",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-partitioning",
+ tier="l1",
+ lanes=("full",),
+ paths=("partitioning/test_001_*",),
+ jobs="8",
+ variants=("standard",),
),
Suite(
# Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
- "dynamo-hlo", tier="l1", lanes=("full",),
- paths=("hlo/",), jobs="8",
- ),
- Suite(
- "dynamo-models-critical", tier="l1", lanes=("full",),
- paths=("models/",), markers="critical",
- ),
- Suite(
- "torch-compile-backend", tier="l1", lanes=("full",),
+ "dynamo-hlo",
+ tier="l1",
+ lanes=("full",),
+ paths=("hlo/",),
+ jobs="8",
+ ),
+ Suite(
+ "dynamo-models-critical",
+ tier="l1",
+ lanes=("full",),
+ paths=("models/",),
+ markers="critical",
+ ),
+ Suite(
+ "torch-compile-backend",
+ tier="l1",
+ lanes=("full",),
paths=("backend/",),
),
Suite(
- "torch-compile-models-critical", tier="l1", lanes=("full",),
+ "torch-compile-models-critical",
+ tier="l1",
+ lanes=("full",),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="critical", ir="torch_compile",
- ),
- Suite(
- "ts-models", tier="l1", lanes=("full",),
- cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+ markers="critical",
+ ir="torch_compile",
+ ),
+ Suite(
+ "ts-models",
+ tier="l1",
+ lanes=("full",),
+ cwd="tests/py/ts",
+ paths=("models/",),
+ setup=("hub",),
+ variants=("standard",),
),
]
# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
Suite(
- "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+ "torch-compile-models",
+ tier="l2",
+ lanes=("full", "nightly"),
paths=("models/test_models.py", "models/test_dyn_models.py"),
- markers="not critical", ir="torch_compile", jobs="auto",
- ),
- Suite(
- "dynamo-models", tier="l2", lanes=("full", "nightly"),
- paths=("models/",), markers="not critical", jobs="auto",
- ),
- Suite(
- "dynamo-llm", tier="l2", lanes=("nightly",),
- paths=("llm/",), jobs="auto",
- ),
- Suite(
- "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
- paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
- ),
- Suite(
- "executorch", tier="l2", lanes=("nightly",),
- paths=("executorch/",), setup=("executorch",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
+ markers="not critical",
+ ir="torch_compile",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-models",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("models/",),
+ markers="not critical",
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-llm",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("llm/",),
+ jobs="auto",
+ ),
+ Suite(
+ "dynamo-runtime-full",
+ tier="l2",
+ lanes=("full", "nightly"),
+ paths=("runtime/",),
+ keyword="not test_000_ and not test_001_",
+ jobs="auto",
+ ),
+ Suite(
+ "executorch",
+ tier="l2",
+ lanes=("nightly",),
+ paths=("executorch/",),
+ setup=("executorch",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
),
Suite(
# Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
# (The redundant conversion/ re-run from the old l2_plugin is dropped.)
- "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+ "plugins-automatic",
+ tier="l2",
+ lanes=("nightly",),
+ jobs="auto",
paths=(
"automatic_plugin/test_automatic_plugin.py",
"automatic_plugin/test_automatic_plugin_with_attrs.py",
"automatic_plugin/test_flashinfer_rmsnorm.py",
),
overrides={"rtx": {"paths": ("automatic_plugin/",)}},
),
Suite(
- "kernels", tier="l2", lanes=("nightly",),
- cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
- variants=("standard",), platforms=("linux-x86_64",),
- ),
- Suite(
- "ts-integrations", tier="l2", lanes=("nightly",),
- cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
- variants=("standard",),
- ),
- Suite(
- "distributed", tier="l2", lanes=("nightly",),
+ "kernels",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/kernels",
+ paths=(".",),
+ setup=("cuda-core",),
+ jobs="auto",
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ ),
+ Suite(
+ "ts-integrations",
+ tier="l2",
+ lanes=("nightly",),
+ cwd="tests/py/ts",
+ paths=("integrations/",),
+ setup=("hub",),
+ jobs="auto",
+ variants=("standard",),
+ ),
+ Suite(
+ "distributed",
+ tier="l2",
+ lanes=("nightly",),
paths=(
"distributed/test_nccl_ops.py",
"distributed/test_native_nccl.py",
"distributed/test_export_save_load.py",
),
- jobs="auto", verbose=True, reruns=False, variants=("standard",),
- platforms=("linux-x86_64",), setup=("mpi",),
+ jobs="auto",
+ verbose=True,
+ reruns=False,
+ variants=("standard",),
+ platforms=("linux-x86_64",),
+ setup=("mpi",),
env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
follow=(
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_native_nccl.py", "--multirank"),
- ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
- "distributed/test_export_save_load.py", "--multirank"),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_native_nccl.py",
+ "--multirank",
+ ),
+ (
+ "-m",
+ "torch_tensorrt.distributed.run",
+ "--nproc_per_node=2",
+ "distributed/test_export_save_load.py",
+ "--multirank",
+ ),
),
),
]
# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
Suite(
- "python-only-runtime", tier="l1", lanes=("python-only",),
- paths=("runtime/",), jobs="8", variants=("standard",),
+ "python-only-runtime",
+ tier="l1",
+ lanes=("python-only",),
+ paths=("runtime/",),
+ jobs="8",
+ variants=("standard",),
),
]
SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Description
Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
Fixes # (issue)
Type of change
Please delete options that are not relevant and/or add your own.
Checklist: