Skip to content

Narendasan/test dx #4376

Open
narendasan wants to merge 2 commits into main from narendasan/test-dx
Open

Narendasan/test dx #4376
narendasan wants to merge 2 commits into main from narendasan/test-dx

Conversation

@narendasan

Copy link
Copy Markdown
Collaborator

Description

Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.

Fixes # (issue)

Type of change

Please delete options that are not relevant and/or add your own.

  • Bug fix (non-breaking change which fixes an issue)
  • New feature (non-breaking change which adds functionality)
  • Breaking change (fix or feature that would cause existing functionality to not work as expected)
  • This change requires a documentation update

Checklist:

  • My code follows the style guidelines of this project (You can use the linters)
  • I have performed a self-review of my own code
  • I have commented my code, particularly in hard-to-understand areas and hacks
  • I have made corresponding changes to the documentation
  • I have added tests to verify my fix or my feature
  • New and existing unit tests pass locally with my changes
  • I have added the relevant labels to my PR so that relevant reviewers are notified

@meta-cla meta-cla Bot added the cla signed label Jun 30, 2026
@github-actions github-actions Bot added the "component: tests" (Issues re: Tests) and "component: build system" (Issues re: Build system) labels Jun 30, 2026

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some changes that do not conform to Python style guidelines:

--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-06-30 21:52:03.788259+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest:  python -m tests.ci {list,show,run,matrix,doctor}

-  list                       all suites, tiers, lanes, variants
-  show <name>                a suite's resolved command per variant
-  run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
-  matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
-  doctor                     validate the manifest (CI lints this)
+list                       all suites, tiers, lanes, variants
+show <name>                a suite's resolved command per variant
+run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
+matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
+doctor                     validate the manifest (CI lints this)
"""

from __future__ import annotations

import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name


def _cmd_list(_: argparse.Namespace) -> int:
    width = max(len(s.name) for s in SUITES)
-    print(f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS")
+    print(
+        f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS"
+    )
    for s in SUITES:
-        print(f"{s.name.ljust(width)}  {s.tier:<4}  "
-              f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}")
-    print(f"\n{len(SUITES)} suites.  "
-          f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)")
+        print(
+            f"{s.name.ljust(width)}  {s.tier:<4}  "
+            f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}"
+        )
+    print(
+        f"\n{len(SUITES)} suites.  "
+        f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)"
+    )
    return 0


def _cmd_show(args: argparse.Namespace) -> int:
    s = by_name(args.name)
@@ -39,36 +45,41 @@

def _cmd_run(args: argparse.Namespace) -> int:
    s = by_name(args.name)
    variants = [args.variant] if args.variant else list(s.variants)
    if args.variant and args.variant not in s.variants:
-        print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
-              f"it runs on {s.variants}", file=sys.stderr)
+        print(
+            f"::warning::{s.name} does not run on variant {args.variant!r}; "
+            f"it runs on {s.variants}",
+            file=sys.stderr,
+        )
        return 0
    rc = 0
    for var in variants:
        rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
    return rc


def _cmd_run_lane(args: argparse.Namespace) -> int:
    """Run every suite in a lane/tier, continuing past failures (so one consolidated
    report sees them all). Returns non-zero if any suite failed."""
-    jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
-                  platform=args.platform)
+    jobs = select(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not jobs:
        print("::warning::no suites match the given filters", file=sys.stderr)
        return 0
    rc = 0
    for s, var in jobs:
        rc = run_suite(s, var, dry_run=args.dry_run) or rc
    return rc


def _cmd_matrix(args: argparse.Namespace) -> int:
-    include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
-                     platform=args.platform)
+    include = matrix(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not include:
        print("::warning::matrix is empty for the given filters", file=sys.stderr)
    print(json.dumps({"include": include}))
    return 0

@@ -109,12 +120,14 @@
    if problems:
        for p in problems:
            print(f"✗ {p}", file=sys.stderr)
        print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
        return 1
-    print(f"✓ manifest OK — {len(SUITES)} suites, "
-          f"{len(set(junits))} unique junit paths, no collisions.")
+    print(
+        f"✓ manifest OK — {len(SUITES)} suites, "
+        f"{len(set(junits))} unique junit paths, no collisions."
+    )
    return 0


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
    sp.set_defaults(fn=_cmd_show)

    sp = sub.add_parser("run", help="run one suite")
    sp.add_argument("name")
    sp.add_argument("--variant", choices=("standard", "rtx"))
-    sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
-    sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
-                    "(use `-- -k foo`)")
+    sp.add_argument(
+        "--dry-run", action="store_true", help="print the command, don't run"
+    )
+    sp.add_argument(
+        "pytest_args",
+        nargs="*",
+        help="extra args forwarded to pytest " "(use `-- -k foo`)",
+    )
    sp.set_defaults(fn=_cmd_run)

-    sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+    sp = sub.add_parser(
+        "run-lane", help="run every suite in a lane/tier, past failures"
+    )
    g = sp.add_mutually_exclusive_group()
    g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
    g.add_argument("--tier", choices=("l0", "l1", "l2"))
    sp.add_argument("--variant", choices=("standard", "rtx"))
    sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-06-30 21:52:03.844831+00:00
@@ -22,13 +22,18 @@
)

# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
-    "--reruns", "1", "--reruns-delay", "5",
-    "--only-rerun", "cudaErrorStreamCaptureInvalidated",
-    "--only-rerun", "Stream capture invalidated",
+    "--reruns",
+    "1",
+    "--reruns-delay",
+    "5",
+    "--only-rerun",
+    "cudaErrorStreamCaptureInvalidated",
+    "--only-rerun",
+    "Stream capture invalidated",
]


def _launcher() -> list[str]:
    """The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
    """(argv, cwd) pairs for a named setup step."""
    launcher = _launcher()
    if step == "hub":
        return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
    if step == "executorch":
-        return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
-                 REPO_ROOT)]
+        return [
+            (
+                launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+                REPO_ROOT,
+            )
+        ]
    if step == "cuda-core":
-        return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
-                 REPO_ROOT)]
+        return [
+            (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+        ]
    if step == "mpi":
-        return [(["dnf", "install", "-y", "mpich", "mpich-devel",
-                  "openmpi", "openmpi-devel"], REPO_ROOT)]
+        return [
+            (
+                [
+                    "dnf",
+                    "install",
+                    "-y",
+                    "mpich",
+                    "mpich-devel",
+                    "openmpi",
+                    "openmpi-devel",
+                ],
+                REPO_ROOT,
+            )
+        ]
    raise KeyError(f"unknown setup step {step!r} in a suite definition")


def describe(suite: Suite, variant: Variant) -> str:
    """The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
    process exit code (non-zero on first failure), mirroring the bash tiers."""
    v = suite.for_variant(variant)
    extra = extra or []
    env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
    cwd = REPO_ROOT / v["cwd"]
-    pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    pytest_cmd = (
+        _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    )

    if dry_run:
        print(describe(suite, variant))
        if extra:
            print(f"  # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
                print(f"::warning::setup step {step!r} exited {rc}", flush=True)

    print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
    rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
    if rc != 0:
-        repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
-                           + build_pytest_args(suite, variant) + extra)
-        print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
-              flush=True)
+        repro = shlex.join(
+            ["uv", "run", "--no-sync", "pytest"]
+            + build_pytest_args(suite, variant)
+            + extra
+        )
+        print(
+            f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+            flush=True,
+        )
        return rc

    for f in v["follow"]:
        fcmd = _launcher() + list(f)
        print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@


def matrix(**filters: str | None) -> list[dict[str, str]]:
    """GitHub-Actions matrix ``include`` entries for the selected jobs."""
    return [
-        {"suite": s.name, "variant": var, "tier": s.tier,
-         "cwd": s.for_variant(var)["cwd"]}
+        {
+            "suite": s.name,
+            "variant": var,
+            "tier": s.tier,
+            "cwd": s.for_variant(var)["cwd"],
+        }
        for s, var in select(**filters)
    ]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-06-30 21:51:41.118493+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-06-30 21:52:03.892690+00:00
@@ -54,172 +54,286 @@
    """

    name: str
    tier: Tier
    lanes: tuple[Lane, ...]
-    cwd: str = "tests/py/dynamo"          # relative to repo root
-    paths: tuple[str, ...] = ()           # pytest positionals (rel to cwd); globs ok
-    markers: str | None = None            # -m EXPR
-    keyword: str | None = None            # -k EXPR
-    dist: str | None = None               # --dist=loadscope
-    maxfail: int | None = None            # --maxfail=N
-    ir: str | None = None                 # --ir torch_compile
-    jobs: str | None = None               # xdist default: None=serial, "8"/"auto"/"4"
-    reruns: bool = True                   # wrap in the flake-rerun helper
-    verbose: bool = False                 # -v
+    cwd: str = "tests/py/dynamo"  # relative to repo root
+    paths: tuple[str, ...] = ()  # pytest positionals (rel to cwd); globs ok
+    markers: str | None = None  # -m EXPR
+    keyword: str | None = None  # -k EXPR
+    dist: str | None = None  # --dist=loadscope
+    maxfail: int | None = None  # --maxfail=N
+    ir: str | None = None  # --ir torch_compile
+    jobs: str | None = None  # xdist default: None=serial, "8"/"auto"/"4"
+    reruns: bool = True  # wrap in the flake-rerun helper
+    verbose: bool = False  # -v
    variants: tuple[Variant, ...] = ALL_VARIANTS
-    platforms: tuple[Platform, ...] = ALL_PLATFORMS   # channels this suite runs on
-    setup: tuple[str, ...] = ()           # named pre-steps: hub|executorch|cuda-core|mpi
-    follow: tuple[tuple[str, ...], ...] = ()   # extra argv to run AFTER pytest
+    platforms: tuple[Platform, ...] = ALL_PLATFORMS  # channels this suite runs on
+    setup: tuple[str, ...] = ()  # named pre-steps: hub|executorch|cuda-core|mpi
+    follow: tuple[tuple[str, ...], ...] = ()  # extra argv to run AFTER pytest
    env: dict[str, str] = field(default_factory=dict)
    overrides: dict[str, dict[str, Any]] = field(default_factory=dict)  # per-variant

    def for_variant(self, variant: Variant) -> dict[str, Any]:
        """This suite's effective fields for ``variant`` (applies overrides)."""
-        base = {f: getattr(self, f) for f in (
-            "cwd", "paths", "markers", "keyword", "dist", "maxfail",
-            "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
-        )}
+        base = {
+            f: getattr(self, f)
+            for f in (
+                "cwd",
+                "paths",
+                "markers",
+                "keyword",
+                "dist",
+                "maxfail",
+                "ir",
+                "jobs",
+                "reruns",
+                "verbose",
+                "setup",
+                "follow",
+                "env",
+            )
+        }
        base.update(self.overrides.get(variant, {}))
        return base


# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
    Suite(
-        "dynamo-converters", tier="l0", lanes=("fast", "full"),
-        paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+        "dynamo-converters",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("conversion/",),
+        dist="--dist=loadscope",
+        maxfail=20,
+        jobs="8",
        # RTX does not shard converters with loadscope.
        overrides={"rtx": {"dist": None}},
    ),
    Suite(
-        "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("runtime/test_000_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("partitioning/test_000_*",), jobs="8",
+        "dynamo-runtime-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("runtime/test_000_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("partitioning/test_000_*",),
+        jobs="8",
        # RTX runs the whole partitioning suite (no smoke subset split).
        overrides={"rtx": {"paths": ("partitioning/",)}},
    ),
    Suite(
-        "dynamo-lowering", tier="l0", lanes=("fast", "full"),
-        paths=("lowering/",), jobs="8",
-    ),
-    Suite(
-        "py-core", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/core", paths=(".",), jobs="8",
-    ),
-    Suite(
-        "ts-api", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+        "dynamo-lowering",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("lowering/",),
+        jobs="8",
+    ),
+    Suite(
+        "py-core",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/core",
+        paths=(".",),
+        jobs="8",
+    ),
+    Suite(
+        "ts-api",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/ts",
+        paths=("api/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
    Suite(
-        "dynamo-runtime", tier="l1", lanes=("full",),
-        paths=("runtime/test_001_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning", tier="l1", lanes=("full",),
-        paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+        "dynamo-runtime",
+        tier="l1",
+        lanes=("full",),
+        paths=("runtime/test_001_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning",
+        tier="l1",
+        lanes=("full",),
+        paths=("partitioning/test_001_*",),
+        jobs="8",
+        variants=("standard",),
    ),
    Suite(
        # Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
-        "dynamo-hlo", tier="l1", lanes=("full",),
-        paths=("hlo/",), jobs="8",
-    ),
-    Suite(
-        "dynamo-models-critical", tier="l1", lanes=("full",),
-        paths=("models/",), markers="critical",
-    ),
-    Suite(
-        "torch-compile-backend", tier="l1", lanes=("full",),
+        "dynamo-hlo",
+        tier="l1",
+        lanes=("full",),
+        paths=("hlo/",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-models-critical",
+        tier="l1",
+        lanes=("full",),
+        paths=("models/",),
+        markers="critical",
+    ),
+    Suite(
+        "torch-compile-backend",
+        tier="l1",
+        lanes=("full",),
        paths=("backend/",),
    ),
    Suite(
-        "torch-compile-models-critical", tier="l1", lanes=("full",),
+        "torch-compile-models-critical",
+        tier="l1",
+        lanes=("full",),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="critical", ir="torch_compile",
-    ),
-    Suite(
-        "ts-models", tier="l1", lanes=("full",),
-        cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+        markers="critical",
+        ir="torch_compile",
+    ),
+    Suite(
+        "ts-models",
+        tier="l1",
+        lanes=("full",),
+        cwd="tests/py/ts",
+        paths=("models/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
    Suite(
-        "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+        "torch-compile-models",
+        tier="l2",
+        lanes=("full", "nightly"),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="not critical", ir="torch_compile", jobs="auto",
-    ),
-    Suite(
-        "dynamo-models", tier="l2", lanes=("full", "nightly"),
-        paths=("models/",), markers="not critical", jobs="auto",
-    ),
-    Suite(
-        "dynamo-llm", tier="l2", lanes=("nightly",),
-        paths=("llm/",), jobs="auto",
-    ),
-    Suite(
-        "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
-        paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
-    ),
-    Suite(
-        "executorch", tier="l2", lanes=("nightly",),
-        paths=("executorch/",), setup=("executorch",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
+        markers="not critical",
+        ir="torch_compile",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-models",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("models/",),
+        markers="not critical",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-llm",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("llm/",),
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-runtime-full",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("runtime/",),
+        keyword="not test_000_ and not test_001_",
+        jobs="auto",
+    ),
+    Suite(
+        "executorch",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("executorch/",),
+        setup=("executorch",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
    ),
    Suite(
        # Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
        # (The redundant conversion/ re-run from the old l2_plugin is dropped.)
-        "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+        "plugins-automatic",
+        tier="l2",
+        lanes=("nightly",),
+        jobs="auto",
        paths=(
            "automatic_plugin/test_automatic_plugin.py",
            "automatic_plugin/test_automatic_plugin_with_attrs.py",
            "automatic_plugin/test_flashinfer_rmsnorm.py",
        ),
        overrides={"rtx": {"paths": ("automatic_plugin/",)}},
    ),
    Suite(
-        "kernels", tier="l2", lanes=("nightly",),
-        cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
-    ),
-    Suite(
-        "ts-integrations", tier="l2", lanes=("nightly",),
-        cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
-        variants=("standard",),
-    ),
-    Suite(
-        "distributed", tier="l2", lanes=("nightly",),
+        "kernels",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/kernels",
+        paths=(".",),
+        setup=("cuda-core",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+    ),
+    Suite(
+        "ts-integrations",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/ts",
+        paths=("integrations/",),
+        setup=("hub",),
+        jobs="auto",
+        variants=("standard",),
+    ),
+    Suite(
+        "distributed",
+        tier="l2",
+        lanes=("nightly",),
        paths=(
            "distributed/test_nccl_ops.py",
            "distributed/test_native_nccl.py",
            "distributed/test_export_save_load.py",
        ),
-        jobs="auto", verbose=True, reruns=False, variants=("standard",),
-        platforms=("linux-x86_64",), setup=("mpi",),
+        jobs="auto",
+        verbose=True,
+        reruns=False,
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+        setup=("mpi",),
        env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
        follow=(
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_native_nccl.py", "--multirank"),
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_export_save_load.py", "--multirank"),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_native_nccl.py",
+                "--multirank",
+            ),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_export_save_load.py",
+                "--multirank",
+            ),
        ),
    ),
]

# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
    Suite(
-        "python-only-runtime", tier="l1", lanes=("python-only",),
-        paths=("runtime/",), jobs="8", variants=("standard",),
+        "python-only-runtime",
+        tier="l1",
+        lanes=("python-only",),
+        paths=("runtime/",),
+        jobs="8",
+        variants=("standard",),
    ),
]

SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)

@narendasan narendasan force-pushed the narendasan/test-dx branch from 1e92f97 to b58b80a Compare July 1, 2026 18:49

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some changes that do not conform to Python style guidelines:

--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-01 18:50:01.703469+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest:  python -m tests.ci {list,show,run,matrix,doctor}

-  list                       all suites, tiers, lanes, variants
-  show <name>                a suite's resolved command per variant
-  run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
-  matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
-  doctor                     validate the manifest (CI lints this)
+list                       all suites, tiers, lanes, variants
+show <name>                a suite's resolved command per variant
+run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
+matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
+doctor                     validate the manifest (CI lints this)
"""

from __future__ import annotations

import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name


def _cmd_list(_: argparse.Namespace) -> int:
    width = max(len(s.name) for s in SUITES)
-    print(f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS")
+    print(
+        f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS"
+    )
    for s in SUITES:
-        print(f"{s.name.ljust(width)}  {s.tier:<4}  "
-              f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}")
-    print(f"\n{len(SUITES)} suites.  "
-          f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)")
+        print(
+            f"{s.name.ljust(width)}  {s.tier:<4}  "
+            f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}"
+        )
+    print(
+        f"\n{len(SUITES)} suites.  "
+        f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)"
+    )
    return 0


def _cmd_show(args: argparse.Namespace) -> int:
    s = by_name(args.name)
@@ -39,36 +45,41 @@

def _cmd_run(args: argparse.Namespace) -> int:
    s = by_name(args.name)
    variants = [args.variant] if args.variant else list(s.variants)
    if args.variant and args.variant not in s.variants:
-        print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
-              f"it runs on {s.variants}", file=sys.stderr)
+        print(
+            f"::warning::{s.name} does not run on variant {args.variant!r}; "
+            f"it runs on {s.variants}",
+            file=sys.stderr,
+        )
        return 0
    rc = 0
    for var in variants:
        rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
    return rc


def _cmd_run_lane(args: argparse.Namespace) -> int:
    """Run every suite in a lane/tier, continuing past failures (so one consolidated
    report sees them all). Returns non-zero if any suite failed."""
-    jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
-                  platform=args.platform)
+    jobs = select(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not jobs:
        print("::warning::no suites match the given filters", file=sys.stderr)
        return 0
    rc = 0
    for s, var in jobs:
        rc = run_suite(s, var, dry_run=args.dry_run) or rc
    return rc


def _cmd_matrix(args: argparse.Namespace) -> int:
-    include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
-                     platform=args.platform)
+    include = matrix(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not include:
        print("::warning::matrix is empty for the given filters", file=sys.stderr)
    print(json.dumps({"include": include}))
    return 0

@@ -109,12 +120,14 @@
    if problems:
        for p in problems:
            print(f"✗ {p}", file=sys.stderr)
        print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
        return 1
-    print(f"✓ manifest OK — {len(SUITES)} suites, "
-          f"{len(set(junits))} unique junit paths, no collisions.")
+    print(
+        f"✓ manifest OK — {len(SUITES)} suites, "
+        f"{len(set(junits))} unique junit paths, no collisions."
+    )
    return 0


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
    sp.set_defaults(fn=_cmd_show)

    sp = sub.add_parser("run", help="run one suite")
    sp.add_argument("name")
    sp.add_argument("--variant", choices=("standard", "rtx"))
-    sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
-    sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
-                    "(use `-- -k foo`)")
+    sp.add_argument(
+        "--dry-run", action="store_true", help="print the command, don't run"
+    )
+    sp.add_argument(
+        "pytest_args",
+        nargs="*",
+        help="extra args forwarded to pytest " "(use `-- -k foo`)",
+    )
    sp.set_defaults(fn=_cmd_run)

-    sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+    sp = sub.add_parser(
+        "run-lane", help="run every suite in a lane/tier, past failures"
+    )
    g = sp.add_mutually_exclusive_group()
    g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
    g.add_argument("--tier", choices=("l0", "l1", "l2"))
    sp.add_argument("--variant", choices=("standard", "rtx"))
    sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-01 18:50:01.775392+00:00
@@ -22,13 +22,18 @@
)

# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
-    "--reruns", "1", "--reruns-delay", "5",
-    "--only-rerun", "cudaErrorStreamCaptureInvalidated",
-    "--only-rerun", "Stream capture invalidated",
+    "--reruns",
+    "1",
+    "--reruns-delay",
+    "5",
+    "--only-rerun",
+    "cudaErrorStreamCaptureInvalidated",
+    "--only-rerun",
+    "Stream capture invalidated",
]


def _launcher() -> list[str]:
    """The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
    """(argv, cwd) pairs for a named setup step."""
    launcher = _launcher()
    if step == "hub":
        return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
    if step == "executorch":
-        return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
-                 REPO_ROOT)]
+        return [
+            (
+                launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+                REPO_ROOT,
+            )
+        ]
    if step == "cuda-core":
-        return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
-                 REPO_ROOT)]
+        return [
+            (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+        ]
    if step == "mpi":
-        return [(["dnf", "install", "-y", "mpich", "mpich-devel",
-                  "openmpi", "openmpi-devel"], REPO_ROOT)]
+        return [
+            (
+                [
+                    "dnf",
+                    "install",
+                    "-y",
+                    "mpich",
+                    "mpich-devel",
+                    "openmpi",
+                    "openmpi-devel",
+                ],
+                REPO_ROOT,
+            )
+        ]
    raise KeyError(f"unknown setup step {step!r} in a suite definition")


def describe(suite: Suite, variant: Variant) -> str:
    """The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
    process exit code (non-zero on first failure), mirroring the bash tiers."""
    v = suite.for_variant(variant)
    extra = extra or []
    env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
    cwd = REPO_ROOT / v["cwd"]
-    pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    pytest_cmd = (
+        _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    )

    if dry_run:
        print(describe(suite, variant))
        if extra:
            print(f"  # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
                print(f"::warning::setup step {step!r} exited {rc}", flush=True)

    print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
    rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
    if rc != 0:
-        repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
-                           + build_pytest_args(suite, variant) + extra)
-        print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
-              flush=True)
+        repro = shlex.join(
+            ["uv", "run", "--no-sync", "pytest"]
+            + build_pytest_args(suite, variant)
+            + extra
+        )
+        print(
+            f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+            flush=True,
+        )
        return rc

    for f in v["follow"]:
        fcmd = _launcher() + list(f)
        print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@


def matrix(**filters: str | None) -> list[dict[str, str]]:
    """GitHub-Actions matrix ``include`` entries for the selected jobs."""
    return [
-        {"suite": s.name, "variant": var, "tier": s.tier,
-         "cwd": s.for_variant(var)["cwd"]}
+        {
+            "suite": s.name,
+            "variant": var,
+            "tier": s.tier,
+            "cwd": s.for_variant(var)["cwd"],
+        }
        for s, var in select(**filters)
    ]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-01 18:49:37.400379+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-01 18:50:01.849014+00:00
@@ -54,172 +54,286 @@
    """

    name: str
    tier: Tier
    lanes: tuple[Lane, ...]
-    cwd: str = "tests/py/dynamo"          # relative to repo root
-    paths: tuple[str, ...] = ()           # pytest positionals (rel to cwd); globs ok
-    markers: str | None = None            # -m EXPR
-    keyword: str | None = None            # -k EXPR
-    dist: str | None = None               # --dist=loadscope
-    maxfail: int | None = None            # --maxfail=N
-    ir: str | None = None                 # --ir torch_compile
-    jobs: str | None = None               # xdist default: None=serial, "8"/"auto"/"4"
-    reruns: bool = True                   # wrap in the flake-rerun helper
-    verbose: bool = False                 # -v
+    cwd: str = "tests/py/dynamo"  # relative to repo root
+    paths: tuple[str, ...] = ()  # pytest positionals (rel to cwd); globs ok
+    markers: str | None = None  # -m EXPR
+    keyword: str | None = None  # -k EXPR
+    dist: str | None = None  # --dist=loadscope
+    maxfail: int | None = None  # --maxfail=N
+    ir: str | None = None  # --ir torch_compile
+    jobs: str | None = None  # xdist default: None=serial, "8"/"auto"/"4"
+    reruns: bool = True  # wrap in the flake-rerun helper
+    verbose: bool = False  # -v
    variants: tuple[Variant, ...] = ALL_VARIANTS
-    platforms: tuple[Platform, ...] = ALL_PLATFORMS   # channels this suite runs on
-    setup: tuple[str, ...] = ()           # named pre-steps: hub|executorch|cuda-core|mpi
-    follow: tuple[tuple[str, ...], ...] = ()   # extra argv to run AFTER pytest
+    platforms: tuple[Platform, ...] = ALL_PLATFORMS  # channels this suite runs on
+    setup: tuple[str, ...] = ()  # named pre-steps: hub|executorch|cuda-core|mpi
+    follow: tuple[tuple[str, ...], ...] = ()  # extra argv to run AFTER pytest
    env: dict[str, str] = field(default_factory=dict)
    overrides: dict[str, dict[str, Any]] = field(default_factory=dict)  # per-variant

    def for_variant(self, variant: Variant) -> dict[str, Any]:
        """This suite's effective fields for ``variant`` (applies overrides)."""
-        base = {f: getattr(self, f) for f in (
-            "cwd", "paths", "markers", "keyword", "dist", "maxfail",
-            "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
-        )}
+        base = {
+            f: getattr(self, f)
+            for f in (
+                "cwd",
+                "paths",
+                "markers",
+                "keyword",
+                "dist",
+                "maxfail",
+                "ir",
+                "jobs",
+                "reruns",
+                "verbose",
+                "setup",
+                "follow",
+                "env",
+            )
+        }
        base.update(self.overrides.get(variant, {}))
        return base


# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
    Suite(
-        "dynamo-converters", tier="l0", lanes=("fast", "full"),
-        paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+        "dynamo-converters",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("conversion/",),
+        dist="--dist=loadscope",
+        maxfail=20,
+        jobs="8",
        # RTX does not shard converters with loadscope.
        overrides={"rtx": {"dist": None}},
    ),
    Suite(
-        "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("runtime/test_000_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("partitioning/test_000_*",), jobs="8",
+        "dynamo-runtime-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("runtime/test_000_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("partitioning/test_000_*",),
+        jobs="8",
        # RTX runs the whole partitioning suite (no smoke subset split).
        overrides={"rtx": {"paths": ("partitioning/",)}},
    ),
    Suite(
-        "dynamo-lowering", tier="l0", lanes=("fast", "full"),
-        paths=("lowering/",), jobs="8",
-    ),
-    Suite(
-        "py-core", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/core", paths=(".",), jobs="8",
-    ),
-    Suite(
-        "ts-api", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+        "dynamo-lowering",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("lowering/",),
+        jobs="8",
+    ),
+    Suite(
+        "py-core",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/core",
+        paths=(".",),
+        jobs="8",
+    ),
+    Suite(
+        "ts-api",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/ts",
+        paths=("api/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
    Suite(
-        "dynamo-runtime", tier="l1", lanes=("full",),
-        paths=("runtime/test_001_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning", tier="l1", lanes=("full",),
-        paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+        "dynamo-runtime",
+        tier="l1",
+        lanes=("full",),
+        paths=("runtime/test_001_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning",
+        tier="l1",
+        lanes=("full",),
+        paths=("partitioning/test_001_*",),
+        jobs="8",
+        variants=("standard",),
    ),
    Suite(
        # Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
-        "dynamo-hlo", tier="l1", lanes=("full",),
-        paths=("hlo/",), jobs="8",
-    ),
-    Suite(
-        "dynamo-models-critical", tier="l1", lanes=("full",),
-        paths=("models/",), markers="critical",
-    ),
-    Suite(
-        "torch-compile-backend", tier="l1", lanes=("full",),
+        "dynamo-hlo",
+        tier="l1",
+        lanes=("full",),
+        paths=("hlo/",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-models-critical",
+        tier="l1",
+        lanes=("full",),
+        paths=("models/",),
+        markers="critical",
+    ),
+    Suite(
+        "torch-compile-backend",
+        tier="l1",
+        lanes=("full",),
        paths=("backend/",),
    ),
    Suite(
-        "torch-compile-models-critical", tier="l1", lanes=("full",),
+        "torch-compile-models-critical",
+        tier="l1",
+        lanes=("full",),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="critical", ir="torch_compile",
-    ),
-    Suite(
-        "ts-models", tier="l1", lanes=("full",),
-        cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+        markers="critical",
+        ir="torch_compile",
+    ),
+    Suite(
+        "ts-models",
+        tier="l1",
+        lanes=("full",),
+        cwd="tests/py/ts",
+        paths=("models/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
    Suite(
-        "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+        "torch-compile-models",
+        tier="l2",
+        lanes=("full", "nightly"),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="not critical", ir="torch_compile", jobs="auto",
-    ),
-    Suite(
-        "dynamo-models", tier="l2", lanes=("full", "nightly"),
-        paths=("models/",), markers="not critical", jobs="auto",
-    ),
-    Suite(
-        "dynamo-llm", tier="l2", lanes=("nightly",),
-        paths=("llm/",), jobs="auto",
-    ),
-    Suite(
-        "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
-        paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
-    ),
-    Suite(
-        "executorch", tier="l2", lanes=("nightly",),
-        paths=("executorch/",), setup=("executorch",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
+        markers="not critical",
+        ir="torch_compile",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-models",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("models/",),
+        markers="not critical",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-llm",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("llm/",),
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-runtime-full",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("runtime/",),
+        keyword="not test_000_ and not test_001_",
+        jobs="auto",
+    ),
+    Suite(
+        "executorch",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("executorch/",),
+        setup=("executorch",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
    ),
    Suite(
        # Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
        # (The redundant conversion/ re-run from the old l2_plugin is dropped.)
-        "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+        "plugins-automatic",
+        tier="l2",
+        lanes=("nightly",),
+        jobs="auto",
        paths=(
            "automatic_plugin/test_automatic_plugin.py",
            "automatic_plugin/test_automatic_plugin_with_attrs.py",
            "automatic_plugin/test_flashinfer_rmsnorm.py",
        ),
        overrides={"rtx": {"paths": ("automatic_plugin/",)}},
    ),
    Suite(
-        "kernels", tier="l2", lanes=("nightly",),
-        cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
-    ),
-    Suite(
-        "ts-integrations", tier="l2", lanes=("nightly",),
-        cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
-        variants=("standard",),
-    ),
-    Suite(
-        "distributed", tier="l2", lanes=("nightly",),
+        "kernels",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/kernels",
+        paths=(".",),
+        setup=("cuda-core",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+    ),
+    Suite(
+        "ts-integrations",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/ts",
+        paths=("integrations/",),
+        setup=("hub",),
+        jobs="auto",
+        variants=("standard",),
+    ),
+    Suite(
+        "distributed",
+        tier="l2",
+        lanes=("nightly",),
        paths=(
            "distributed/test_nccl_ops.py",
            "distributed/test_native_nccl.py",
            "distributed/test_export_save_load.py",
        ),
-        jobs="auto", verbose=True, reruns=False, variants=("standard",),
-        platforms=("linux-x86_64",), setup=("mpi",),
+        jobs="auto",
+        verbose=True,
+        reruns=False,
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+        setup=("mpi",),
        env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
        follow=(
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_native_nccl.py", "--multirank"),
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_export_save_load.py", "--multirank"),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_native_nccl.py",
+                "--multirank",
+            ),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_export_save_load.py",
+                "--multirank",
+            ),
        ),
    ),
]

# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
    Suite(
-        "python-only-runtime", tier="l1", lanes=("python-only",),
-        paths=("runtime/",), jobs="8", variants=("standard",),
+        "python-only-runtime",
+        tier="l1",
+        lanes=("python-only",),
+        paths=("runtime/",),
+        jobs="8",
+        variants=("standard",),
    ),
]

SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)

@narendasan narendasan force-pushed the narendasan/test-dx branch from b58b80a to 8e99636 Compare July 1, 2026 19:09

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some changes that do not conform to Python style guidelines:

--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-01 19:10:36.520231+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest:  python -m tests.ci {list,show,run,matrix,doctor}

-  list                       all suites, tiers, lanes, variants
-  show <name>                a suite's resolved command per variant
-  run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
-  matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
-  doctor                     validate the manifest (CI lints this)
+list                       all suites, tiers, lanes, variants
+show <name>                a suite's resolved command per variant
+run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
+matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
+doctor                     validate the manifest (CI lints this)
"""

from __future__ import annotations

import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name


def _cmd_list(_: argparse.Namespace) -> int:
    width = max(len(s.name) for s in SUITES)
-    print(f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS")
+    print(
+        f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS"
+    )
    for s in SUITES:
-        print(f"{s.name.ljust(width)}  {s.tier:<4}  "
-              f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}")
-    print(f"\n{len(SUITES)} suites.  "
-          f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)")
+        print(
+            f"{s.name.ljust(width)}  {s.tier:<4}  "
+            f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}"
+        )
+    print(
+        f"\n{len(SUITES)} suites.  "
+        f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)"
+    )
    return 0


def _cmd_show(args: argparse.Namespace) -> int:
    s = by_name(args.name)
@@ -39,36 +45,41 @@

def _cmd_run(args: argparse.Namespace) -> int:
    s = by_name(args.name)
    variants = [args.variant] if args.variant else list(s.variants)
    if args.variant and args.variant not in s.variants:
-        print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
-              f"it runs on {s.variants}", file=sys.stderr)
+        print(
+            f"::warning::{s.name} does not run on variant {args.variant!r}; "
+            f"it runs on {s.variants}",
+            file=sys.stderr,
+        )
        return 0
    rc = 0
    for var in variants:
        rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
    return rc


def _cmd_run_lane(args: argparse.Namespace) -> int:
    """Run every suite in a lane/tier, continuing past failures (so one consolidated
    report sees them all). Returns non-zero if any suite failed."""
-    jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
-                  platform=args.platform)
+    jobs = select(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not jobs:
        print("::warning::no suites match the given filters", file=sys.stderr)
        return 0
    rc = 0
    for s, var in jobs:
        rc = run_suite(s, var, dry_run=args.dry_run) or rc
    return rc


def _cmd_matrix(args: argparse.Namespace) -> int:
-    include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
-                     platform=args.platform)
+    include = matrix(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not include:
        print("::warning::matrix is empty for the given filters", file=sys.stderr)
    print(json.dumps({"include": include}))
    return 0

@@ -109,12 +120,14 @@
    if problems:
        for p in problems:
            print(f"✗ {p}", file=sys.stderr)
        print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
        return 1
-    print(f"✓ manifest OK — {len(SUITES)} suites, "
-          f"{len(set(junits))} unique junit paths, no collisions.")
+    print(
+        f"✓ manifest OK — {len(SUITES)} suites, "
+        f"{len(set(junits))} unique junit paths, no collisions."
+    )
    return 0


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
    sp.set_defaults(fn=_cmd_show)

    sp = sub.add_parser("run", help="run one suite")
    sp.add_argument("name")
    sp.add_argument("--variant", choices=("standard", "rtx"))
-    sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
-    sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
-                    "(use `-- -k foo`)")
+    sp.add_argument(
+        "--dry-run", action="store_true", help="print the command, don't run"
+    )
+    sp.add_argument(
+        "pytest_args",
+        nargs="*",
+        help="extra args forwarded to pytest " "(use `-- -k foo`)",
+    )
    sp.set_defaults(fn=_cmd_run)

-    sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+    sp = sub.add_parser(
+        "run-lane", help="run every suite in a lane/tier, past failures"
+    )
    g = sp.add_mutually_exclusive_group()
    g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
    g.add_argument("--tier", choices=("l0", "l1", "l2"))
    sp.add_argument("--variant", choices=("standard", "rtx"))
    sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-01 19:10:36.590423+00:00
@@ -22,13 +22,18 @@
)

# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
-    "--reruns", "1", "--reruns-delay", "5",
-    "--only-rerun", "cudaErrorStreamCaptureInvalidated",
-    "--only-rerun", "Stream capture invalidated",
+    "--reruns",
+    "1",
+    "--reruns-delay",
+    "5",
+    "--only-rerun",
+    "cudaErrorStreamCaptureInvalidated",
+    "--only-rerun",
+    "Stream capture invalidated",
]


def _launcher() -> list[str]:
    """The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
    """(argv, cwd) pairs for a named setup step."""
    launcher = _launcher()
    if step == "hub":
        return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
    if step == "executorch":
-        return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
-                 REPO_ROOT)]
+        return [
+            (
+                launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+                REPO_ROOT,
+            )
+        ]
    if step == "cuda-core":
-        return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
-                 REPO_ROOT)]
+        return [
+            (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+        ]
    if step == "mpi":
-        return [(["dnf", "install", "-y", "mpich", "mpich-devel",
-                  "openmpi", "openmpi-devel"], REPO_ROOT)]
+        return [
+            (
+                [
+                    "dnf",
+                    "install",
+                    "-y",
+                    "mpich",
+                    "mpich-devel",
+                    "openmpi",
+                    "openmpi-devel",
+                ],
+                REPO_ROOT,
+            )
+        ]
    raise KeyError(f"unknown setup step {step!r} in a suite definition")


def describe(suite: Suite, variant: Variant) -> str:
    """The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
    process exit code (non-zero on first failure), mirroring the bash tiers."""
    v = suite.for_variant(variant)
    extra = extra or []
    env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
    cwd = REPO_ROOT / v["cwd"]
-    pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    pytest_cmd = (
+        _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    )

    if dry_run:
        print(describe(suite, variant))
        if extra:
            print(f"  # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
                print(f"::warning::setup step {step!r} exited {rc}", flush=True)

    print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
    rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
    if rc != 0:
-        repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
-                           + build_pytest_args(suite, variant) + extra)
-        print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
-              flush=True)
+        repro = shlex.join(
+            ["uv", "run", "--no-sync", "pytest"]
+            + build_pytest_args(suite, variant)
+            + extra
+        )
+        print(
+            f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+            flush=True,
+        )
        return rc

    for f in v["follow"]:
        fcmd = _launcher() + list(f)
        print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@


def matrix(**filters: str | None) -> list[dict[str, str]]:
    """GitHub-Actions matrix ``include`` entries for the selected jobs."""
    return [
-        {"suite": s.name, "variant": var, "tier": s.tier,
-         "cwd": s.for_variant(var)["cwd"]}
+        {
+            "suite": s.name,
+            "variant": var,
+            "tier": s.tier,
+            "cwd": s.for_variant(var)["cwd"],
+        }
        for s, var in select(**filters)
    ]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-01 19:10:08.640502+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-01 19:10:36.641696+00:00
@@ -54,172 +54,286 @@
    """

    name: str
    tier: Tier
    lanes: tuple[Lane, ...]
-    cwd: str = "tests/py/dynamo"          # relative to repo root
-    paths: tuple[str, ...] = ()           # pytest positionals (rel to cwd); globs ok
-    markers: str | None = None            # -m EXPR
-    keyword: str | None = None            # -k EXPR
-    dist: str | None = None               # --dist=loadscope
-    maxfail: int | None = None            # --maxfail=N
-    ir: str | None = None                 # --ir torch_compile
-    jobs: str | None = None               # xdist default: None=serial, "8"/"auto"/"4"
-    reruns: bool = True                   # wrap in the flake-rerun helper
-    verbose: bool = False                 # -v
+    cwd: str = "tests/py/dynamo"  # relative to repo root
+    paths: tuple[str, ...] = ()  # pytest positionals (rel to cwd); globs ok
+    markers: str | None = None  # -m EXPR
+    keyword: str | None = None  # -k EXPR
+    dist: str | None = None  # --dist=loadscope
+    maxfail: int | None = None  # --maxfail=N
+    ir: str | None = None  # --ir torch_compile
+    jobs: str | None = None  # xdist default: None=serial, "8"/"auto"/"4"
+    reruns: bool = True  # wrap in the flake-rerun helper
+    verbose: bool = False  # -v
    variants: tuple[Variant, ...] = ALL_VARIANTS
-    platforms: tuple[Platform, ...] = ALL_PLATFORMS   # channels this suite runs on
-    setup: tuple[str, ...] = ()           # named pre-steps: hub|executorch|cuda-core|mpi
-    follow: tuple[tuple[str, ...], ...] = ()   # extra argv to run AFTER pytest
+    platforms: tuple[Platform, ...] = ALL_PLATFORMS  # channels this suite runs on
+    setup: tuple[str, ...] = ()  # named pre-steps: hub|executorch|cuda-core|mpi
+    follow: tuple[tuple[str, ...], ...] = ()  # extra argv to run AFTER pytest
    env: dict[str, str] = field(default_factory=dict)
    overrides: dict[str, dict[str, Any]] = field(default_factory=dict)  # per-variant

    def for_variant(self, variant: Variant) -> dict[str, Any]:
        """This suite's effective fields for ``variant`` (applies overrides)."""
-        base = {f: getattr(self, f) for f in (
-            "cwd", "paths", "markers", "keyword", "dist", "maxfail",
-            "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
-        )}
+        base = {
+            f: getattr(self, f)
+            for f in (
+                "cwd",
+                "paths",
+                "markers",
+                "keyword",
+                "dist",
+                "maxfail",
+                "ir",
+                "jobs",
+                "reruns",
+                "verbose",
+                "setup",
+                "follow",
+                "env",
+            )
+        }
        base.update(self.overrides.get(variant, {}))
        return base


# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
    Suite(
-        "dynamo-converters", tier="l0", lanes=("fast", "full"),
-        paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+        "dynamo-converters",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("conversion/",),
+        dist="--dist=loadscope",
+        maxfail=20,
+        jobs="8",
        # RTX does not shard converters with loadscope.
        overrides={"rtx": {"dist": None}},
    ),
    Suite(
-        "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("runtime/test_000_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("partitioning/test_000_*",), jobs="8",
+        "dynamo-runtime-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("runtime/test_000_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("partitioning/test_000_*",),
+        jobs="8",
        # RTX runs the whole partitioning suite (no smoke subset split).
        overrides={"rtx": {"paths": ("partitioning/",)}},
    ),
    Suite(
-        "dynamo-lowering", tier="l0", lanes=("fast", "full"),
-        paths=("lowering/",), jobs="8",
-    ),
-    Suite(
-        "py-core", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/core", paths=(".",), jobs="8",
-    ),
-    Suite(
-        "ts-api", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+        "dynamo-lowering",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("lowering/",),
+        jobs="8",
+    ),
+    Suite(
+        "py-core",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/core",
+        paths=(".",),
+        jobs="8",
+    ),
+    Suite(
+        "ts-api",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/ts",
+        paths=("api/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
    Suite(
-        "dynamo-runtime", tier="l1", lanes=("full",),
-        paths=("runtime/test_001_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning", tier="l1", lanes=("full",),
-        paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+        "dynamo-runtime",
+        tier="l1",
+        lanes=("full",),
+        paths=("runtime/test_001_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning",
+        tier="l1",
+        lanes=("full",),
+        paths=("partitioning/test_001_*",),
+        jobs="8",
+        variants=("standard",),
    ),
    Suite(
        # Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
-        "dynamo-hlo", tier="l1", lanes=("full",),
-        paths=("hlo/",), jobs="8",
-    ),
-    Suite(
-        "dynamo-models-critical", tier="l1", lanes=("full",),
-        paths=("models/",), markers="critical",
-    ),
-    Suite(
-        "torch-compile-backend", tier="l1", lanes=("full",),
+        "dynamo-hlo",
+        tier="l1",
+        lanes=("full",),
+        paths=("hlo/",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-models-critical",
+        tier="l1",
+        lanes=("full",),
+        paths=("models/",),
+        markers="critical",
+    ),
+    Suite(
+        "torch-compile-backend",
+        tier="l1",
+        lanes=("full",),
        paths=("backend/",),
    ),
    Suite(
-        "torch-compile-models-critical", tier="l1", lanes=("full",),
+        "torch-compile-models-critical",
+        tier="l1",
+        lanes=("full",),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="critical", ir="torch_compile",
-    ),
-    Suite(
-        "ts-models", tier="l1", lanes=("full",),
-        cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+        markers="critical",
+        ir="torch_compile",
+    ),
+    Suite(
+        "ts-models",
+        tier="l1",
+        lanes=("full",),
+        cwd="tests/py/ts",
+        paths=("models/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
    Suite(
-        "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+        "torch-compile-models",
+        tier="l2",
+        lanes=("full", "nightly"),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="not critical", ir="torch_compile", jobs="auto",
-    ),
-    Suite(
-        "dynamo-models", tier="l2", lanes=("full", "nightly"),
-        paths=("models/",), markers="not critical", jobs="auto",
-    ),
-    Suite(
-        "dynamo-llm", tier="l2", lanes=("nightly",),
-        paths=("llm/",), jobs="auto",
-    ),
-    Suite(
-        "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
-        paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
-    ),
-    Suite(
-        "executorch", tier="l2", lanes=("nightly",),
-        paths=("executorch/",), setup=("executorch",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
+        markers="not critical",
+        ir="torch_compile",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-models",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("models/",),
+        markers="not critical",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-llm",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("llm/",),
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-runtime-full",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("runtime/",),
+        keyword="not test_000_ and not test_001_",
+        jobs="auto",
+    ),
+    Suite(
+        "executorch",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("executorch/",),
+        setup=("executorch",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
    ),
    Suite(
        # Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
        # (The redundant conversion/ re-run from the old l2_plugin is dropped.)
-        "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+        "plugins-automatic",
+        tier="l2",
+        lanes=("nightly",),
+        jobs="auto",
        paths=(
            "automatic_plugin/test_automatic_plugin.py",
            "automatic_plugin/test_automatic_plugin_with_attrs.py",
            "automatic_plugin/test_flashinfer_rmsnorm.py",
        ),
        overrides={"rtx": {"paths": ("automatic_plugin/",)}},
    ),
    Suite(
-        "kernels", tier="l2", lanes=("nightly",),
-        cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
-    ),
-    Suite(
-        "ts-integrations", tier="l2", lanes=("nightly",),
-        cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
-        variants=("standard",),
-    ),
-    Suite(
-        "distributed", tier="l2", lanes=("nightly",),
+        "kernels",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/kernels",
+        paths=(".",),
+        setup=("cuda-core",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+    ),
+    Suite(
+        "ts-integrations",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/ts",
+        paths=("integrations/",),
+        setup=("hub",),
+        jobs="auto",
+        variants=("standard",),
+    ),
+    Suite(
+        "distributed",
+        tier="l2",
+        lanes=("nightly",),
        paths=(
            "distributed/test_nccl_ops.py",
            "distributed/test_native_nccl.py",
            "distributed/test_export_save_load.py",
        ),
-        jobs="auto", verbose=True, reruns=False, variants=("standard",),
-        platforms=("linux-x86_64",), setup=("mpi",),
+        jobs="auto",
+        verbose=True,
+        reruns=False,
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+        setup=("mpi",),
        env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
        follow=(
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_native_nccl.py", "--multirank"),
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_export_save_load.py", "--multirank"),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_native_nccl.py",
+                "--multirank",
+            ),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_export_save_load.py",
+                "--multirank",
+            ),
        ),
    ),
]

# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
    Suite(
-        "python-only-runtime", tier="l1", lanes=("python-only",),
-        paths=("runtime/",), jobs="8", variants=("standard",),
+        "python-only-runtime",
+        tier="l1",
+        lanes=("python-only",),
+        paths=("runtime/",),
+        jobs="8",
+        variants=("standard",),
    ),
]

SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)

…esign

Add tests/ci/{suites,runner,__main__}.py — a typed, declarative suite manifest
plus one runner (`python -m tests.ci`) that replaces the trt_tier_* bash
functions as the source of truth for what each test job runs. Faithfully
reproduces today's tier selection (verified via --dry-run) while fixing the
junit-path collisions and the double-run of hlo/, and giving every suite the
uniform flake-rerun+repro wrapper.

- pyproject: register smoke/flaky markers; norecursedirs += ci
- justfile: doctor / suites / suite / lane / report recipes (thin callers)
- TESTING_AND_CI_DESIGN.md: full local + CI design (manifest+runner, lanes
  without merge queue, GHA-cache caching, aggregated agent-friendly reports)
@narendasan narendasan force-pushed the narendasan/test-dx branch from da667cd to 58adfb6 Compare July 2, 2026 21:16
ci(test-dx): grant id-token: write at ci.yml top level

The reusable build jobs (build_linux/build_windows) request 'id-token: write'
for OIDC; a reusable workflow can't exceed its caller's permission ceiling, so
ci.yml's top-level 'contents: read' made GitHub reject it (startup_failure:
"nested job 'build' is requesting 'id-token: write', but is only allowed
'id-token: none'"). Grant id-token: write + contents: read, matching the
original per-platform entry workflows.
@narendasan narendasan force-pushed the narendasan/test-dx branch from 58adfb6 to 4577a82 Compare July 2, 2026 21:23
@github-actions github-actions Bot requested a review from lanluo-nvidia July 2, 2026 22:09

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some changes that do not conform to Python style guidelines:

--- /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/__main__.py	2026-07-02 22:45:58.621201+00:00
@@ -1,12 +1,12 @@
"""CLI for the test-suite manifest:  python -m tests.ci {list,show,run,matrix,doctor}

-  list                       all suites, tiers, lanes, variants
-  show <name>                a suite's resolved command per variant
-  run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
-  matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
-  doctor                     validate the manifest (CI lints this)
+list                       all suites, tiers, lanes, variants
+show <name>                a suite's resolved command per variant
+run <name> [opts] [-- ...]  run one suite (the call CI + just both make)
+matrix [--lane|--tier]     JSON matrix `include` for GitHub Actions
+doctor                     validate the manifest (CI lints this)
"""

from __future__ import annotations

import argparse
@@ -17,16 +17,22 @@
from .suites import SUITES, by_name


def _cmd_list(_: argparse.Namespace) -> int:
    width = max(len(s.name) for s in SUITES)
-    print(f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS")
+    print(
+        f"{'SUITE'.ljust(width)}  TIER  LANES                  VARIANTS         PLATFORMS"
+    )
    for s in SUITES:
-        print(f"{s.name.ljust(width)}  {s.tier:<4}  "
-              f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}")
-    print(f"\n{len(SUITES)} suites.  "
-          f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)")
+        print(
+            f"{s.name.ljust(width)}  {s.tier:<4}  "
+            f"{','.join(s.lanes):<21}  {','.join(s.variants):<15}  {','.join(s.platforms)}"
+        )
+    print(
+        f"\n{len(SUITES)} suites.  "
+        f"Run one:  python -m tests.ci run <suite>  (or `just suite <suite>`)"
+    )
    return 0


def _cmd_show(args: argparse.Namespace) -> int:
    s = by_name(args.name)
@@ -39,36 +45,41 @@

def _cmd_run(args: argparse.Namespace) -> int:
    s = by_name(args.name)
    variants = [args.variant] if args.variant else list(s.variants)
    if args.variant and args.variant not in s.variants:
-        print(f"::warning::{s.name} does not run on variant {args.variant!r}; "
-              f"it runs on {s.variants}", file=sys.stderr)
+        print(
+            f"::warning::{s.name} does not run on variant {args.variant!r}; "
+            f"it runs on {s.variants}",
+            file=sys.stderr,
+        )
        return 0
    rc = 0
    for var in variants:
        rc = run_suite(s, var, dry_run=args.dry_run, extra=args.pytest_args) or rc
    return rc


def _cmd_run_lane(args: argparse.Namespace) -> int:
    """Run every suite in a lane/tier, continuing past failures (so one consolidated
    report sees them all). Returns non-zero if any suite failed."""
-    jobs = select(lane=args.lane, tier=args.tier, variant=args.variant,
-                  platform=args.platform)
+    jobs = select(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not jobs:
        print("::warning::no suites match the given filters", file=sys.stderr)
        return 0
    rc = 0
    for s, var in jobs:
        rc = run_suite(s, var, dry_run=args.dry_run) or rc
    return rc


def _cmd_matrix(args: argparse.Namespace) -> int:
-    include = matrix(lane=args.lane, tier=args.tier, variant=args.variant,
-                     platform=args.platform)
+    include = matrix(
+        lane=args.lane, tier=args.tier, variant=args.variant, platform=args.platform
+    )
    if not include:
        print("::warning::matrix is empty for the given filters", file=sys.stderr)
    print(json.dumps({"include": include}))
    return 0

@@ -109,12 +120,14 @@
    if problems:
        for p in problems:
            print(f"✗ {p}", file=sys.stderr)
        print(f"\n{len(problems)} manifest problem(s).", file=sys.stderr)
        return 1
-    print(f"✓ manifest OK — {len(SUITES)} suites, "
-          f"{len(set(junits))} unique junit paths, no collisions.")
+    print(
+        f"✓ manifest OK — {len(SUITES)} suites, "
+        f"{len(set(junits))} unique junit paths, no collisions."
+    )
    return 0


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(prog="python -m tests.ci", description=__doc__)
@@ -127,16 +140,23 @@
    sp.set_defaults(fn=_cmd_show)

    sp = sub.add_parser("run", help="run one suite")
    sp.add_argument("name")
    sp.add_argument("--variant", choices=("standard", "rtx"))
-    sp.add_argument("--dry-run", action="store_true", help="print the command, don't run")
-    sp.add_argument("pytest_args", nargs="*", help="extra args forwarded to pytest "
-                    "(use `-- -k foo`)")
+    sp.add_argument(
+        "--dry-run", action="store_true", help="print the command, don't run"
+    )
+    sp.add_argument(
+        "pytest_args",
+        nargs="*",
+        help="extra args forwarded to pytest " "(use `-- -k foo`)",
+    )
    sp.set_defaults(fn=_cmd_run)

-    sp = sub.add_parser("run-lane", help="run every suite in a lane/tier, past failures")
+    sp = sub.add_parser(
+        "run-lane", help="run every suite in a lane/tier, past failures"
+    )
    g = sp.add_mutually_exclusive_group()
    g.add_argument("--lane", choices=("fast", "full", "nightly", "python-only"))
    g.add_argument("--tier", choices=("l0", "l1", "l2"))
    sp.add_argument("--variant", choices=("standard", "rtx"))
    sp.add_argument("--platform", choices=("linux-x86_64", "windows"))
--- /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/runner.py	2026-07-02 22:45:58.664304+00:00
@@ -22,13 +22,18 @@
)

# Known transient cudagraph/TRT-driver flake signatures. Expand ONLY with
# concrete evidence — a broad regex hides real bugs.
_RERUN_ARGS = [
-    "--reruns", "1", "--reruns-delay", "5",
-    "--only-rerun", "cudaErrorStreamCaptureInvalidated",
-    "--only-rerun", "Stream capture invalidated",
+    "--reruns",
+    "1",
+    "--reruns-delay",
+    "5",
+    "--only-rerun",
+    "cudaErrorStreamCaptureInvalidated",
+    "--only-rerun",
+    "Stream capture invalidated",
]


def _launcher() -> list[str]:
    """The python/pytest launcher. CI leaves PYTHON unset (-> container python);
@@ -110,18 +115,35 @@
    """(argv, cwd) pairs for a named setup step."""
    launcher = _launcher()
    if step == "hub":
        return [(launcher + ["hub.py"], REPO_ROOT / "tests/modules")]
    if step == "executorch":
-        return [(launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
-                 REPO_ROOT)]
+        return [
+            (
+                launcher + ["-m", "pip", "install", "pyyaml", "executorch>=1.3.1"],
+                REPO_ROOT,
+            )
+        ]
    if step == "cuda-core":
-        return [(launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"],
-                 REPO_ROOT)]
+        return [
+            (launcher + ["-m", "pip", "install", "cuda-python", "cuda-core"], REPO_ROOT)
+        ]
    if step == "mpi":
-        return [(["dnf", "install", "-y", "mpich", "mpich-devel",
-                  "openmpi", "openmpi-devel"], REPO_ROOT)]
+        return [
+            (
+                [
+                    "dnf",
+                    "install",
+                    "-y",
+                    "mpich",
+                    "mpich-devel",
+                    "openmpi",
+                    "openmpi-devel",
+                ],
+                REPO_ROOT,
+            )
+        ]
    raise KeyError(f"unknown setup step {step!r} in a suite definition")


def describe(suite: Suite, variant: Variant) -> str:
    """The full command line, for --dry-run / show (quoting-safe display)."""
@@ -148,11 +170,13 @@
    process exit code (non-zero on first failure), mirroring the bash tiers."""
    v = suite.for_variant(variant)
    extra = extra or []
    env = {**os.environ, **{k: str(val) for k, val in v["env"].items()}}
    cwd = REPO_ROOT / v["cwd"]
-    pytest_cmd = _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    pytest_cmd = (
+        _launcher() + ["-m", "pytest"] + build_pytest_args(suite, variant) + extra
+    )

    if dry_run:
        print(describe(suite, variant))
        if extra:
            print(f"  # + extra pytest args: {shlex.join(extra)}")
@@ -166,14 +190,19 @@
                print(f"::warning::setup step {step!r} exited {rc}", flush=True)

    print(f"==> {suite.name} [{variant}]: {shlex.join(pytest_cmd)}", flush=True)
    rc = subprocess.run(pytest_cmd, cwd=cwd, env=env).returncode
    if rc != 0:
-        repro = shlex.join(["uv", "run", "--no-sync", "pytest"]
-                           + build_pytest_args(suite, variant) + extra)
-        print(f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
-              flush=True)
+        repro = shlex.join(
+            ["uv", "run", "--no-sync", "pytest"]
+            + build_pytest_args(suite, variant)
+            + extra
+        )
+        print(
+            f"::warning::{suite.name} failed. Reproduce: cd {v['cwd']} && {repro}",
+            flush=True,
+        )
        return rc

    for f in v["follow"]:
        fcmd = _launcher() + list(f)
        print(f"==> {suite.name} follow: {shlex.join(fcmd)}", flush=True)
@@ -209,9 +238,13 @@


def matrix(**filters: str | None) -> list[dict[str, str]]:
    """GitHub-Actions matrix ``include`` entries for the selected jobs."""
    return [
-        {"suite": s.name, "variant": var, "tier": s.tier,
-         "cwd": s.for_variant(var)["cwd"]}
+        {
+            "suite": s.name,
+            "variant": var,
+            "tier": s.tier,
+            "cwd": s.for_variant(var)["cwd"],
+        }
        for s, var in select(**filters)
    ]
--- /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-02 22:45:37.649152+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/ci/suites.py	2026-07-02 22:45:58.683233+00:00
@@ -54,172 +54,286 @@
    """

    name: str
    tier: Tier
    lanes: tuple[Lane, ...]
-    cwd: str = "tests/py/dynamo"          # relative to repo root
-    paths: tuple[str, ...] = ()           # pytest positionals (rel to cwd); globs ok
-    markers: str | None = None            # -m EXPR
-    keyword: str | None = None            # -k EXPR
-    dist: str | None = None               # --dist=loadscope
-    maxfail: int | None = None            # --maxfail=N
-    ir: str | None = None                 # --ir torch_compile
-    jobs: str | None = None               # xdist default: None=serial, "8"/"auto"/"4"
-    reruns: bool = True                   # wrap in the flake-rerun helper
-    verbose: bool = False                 # -v
+    cwd: str = "tests/py/dynamo"  # relative to repo root
+    paths: tuple[str, ...] = ()  # pytest positionals (rel to cwd); globs ok
+    markers: str | None = None  # -m EXPR
+    keyword: str | None = None  # -k EXPR
+    dist: str | None = None  # --dist=loadscope
+    maxfail: int | None = None  # --maxfail=N
+    ir: str | None = None  # --ir torch_compile
+    jobs: str | None = None  # xdist default: None=serial, "8"/"auto"/"4"
+    reruns: bool = True  # wrap in the flake-rerun helper
+    verbose: bool = False  # -v
    variants: tuple[Variant, ...] = ALL_VARIANTS
-    platforms: tuple[Platform, ...] = ALL_PLATFORMS   # channels this suite runs on
-    setup: tuple[str, ...] = ()           # named pre-steps: hub|executorch|cuda-core|mpi
-    follow: tuple[tuple[str, ...], ...] = ()   # extra argv to run AFTER pytest
+    platforms: tuple[Platform, ...] = ALL_PLATFORMS  # channels this suite runs on
+    setup: tuple[str, ...] = ()  # named pre-steps: hub|executorch|cuda-core|mpi
+    follow: tuple[tuple[str, ...], ...] = ()  # extra argv to run AFTER pytest
    env: dict[str, str] = field(default_factory=dict)
    overrides: dict[str, dict[str, Any]] = field(default_factory=dict)  # per-variant

    def for_variant(self, variant: Variant) -> dict[str, Any]:
        """This suite's effective fields for ``variant`` (applies overrides)."""
-        base = {f: getattr(self, f) for f in (
-            "cwd", "paths", "markers", "keyword", "dist", "maxfail",
-            "ir", "jobs", "reruns", "verbose", "setup", "follow", "env",
-        )}
+        base = {
+            f: getattr(self, f)
+            for f in (
+                "cwd",
+                "paths",
+                "markers",
+                "keyword",
+                "dist",
+                "maxfail",
+                "ir",
+                "jobs",
+                "reruns",
+                "verbose",
+                "setup",
+                "follow",
+                "env",
+            )
+        }
        base.update(self.overrides.get(variant, {}))
        return base


# ── L0 — smoke / fast lane ────────────────────────────────────────────────────
_L0: list[Suite] = [
    Suite(
-        "dynamo-converters", tier="l0", lanes=("fast", "full"),
-        paths=("conversion/",), dist="--dist=loadscope", maxfail=20, jobs="8",
+        "dynamo-converters",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("conversion/",),
+        dist="--dist=loadscope",
+        maxfail=20,
+        jobs="8",
        # RTX does not shard converters with loadscope.
        overrides={"rtx": {"dist": None}},
    ),
    Suite(
-        "dynamo-runtime-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("runtime/test_000_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning-smoke", tier="l0", lanes=("fast", "full"),
-        paths=("partitioning/test_000_*",), jobs="8",
+        "dynamo-runtime-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("runtime/test_000_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning-smoke",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("partitioning/test_000_*",),
+        jobs="8",
        # RTX runs the whole partitioning suite (no smoke subset split).
        overrides={"rtx": {"paths": ("partitioning/",)}},
    ),
    Suite(
-        "dynamo-lowering", tier="l0", lanes=("fast", "full"),
-        paths=("lowering/",), jobs="8",
-    ),
-    Suite(
-        "py-core", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/core", paths=(".",), jobs="8",
-    ),
-    Suite(
-        "ts-api", tier="l0", lanes=("fast", "full"),
-        cwd="tests/py/ts", paths=("api/",), setup=("hub",), variants=("standard",),
+        "dynamo-lowering",
+        tier="l0",
+        lanes=("fast", "full"),
+        paths=("lowering/",),
+        jobs="8",
+    ),
+    Suite(
+        "py-core",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/core",
+        paths=(".",),
+        jobs="8",
+    ),
+    Suite(
+        "ts-api",
+        tier="l0",
+        lanes=("fast", "full"),
+        cwd="tests/py/ts",
+        paths=("api/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L1 — critical-path / full lane ────────────────────────────────────────────
_L1: list[Suite] = [
    Suite(
-        "dynamo-runtime", tier="l1", lanes=("full",),
-        paths=("runtime/test_001_*",), jobs="8",
-    ),
-    Suite(
-        "dynamo-partitioning", tier="l1", lanes=("full",),
-        paths=("partitioning/test_001_*",), jobs="8", variants=("standard",),
+        "dynamo-runtime",
+        tier="l1",
+        lanes=("full",),
+        paths=("runtime/test_001_*",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-partitioning",
+        tier="l1",
+        lanes=("full",),
+        paths=("partitioning/test_001_*",),
+        jobs="8",
+        variants=("standard",),
    ),
    Suite(
        # Was run in BOTH l0_core (std) and l1_dynamo_core (both) — deduped to once.
-        "dynamo-hlo", tier="l1", lanes=("full",),
-        paths=("hlo/",), jobs="8",
-    ),
-    Suite(
-        "dynamo-models-critical", tier="l1", lanes=("full",),
-        paths=("models/",), markers="critical",
-    ),
-    Suite(
-        "torch-compile-backend", tier="l1", lanes=("full",),
+        "dynamo-hlo",
+        tier="l1",
+        lanes=("full",),
+        paths=("hlo/",),
+        jobs="8",
+    ),
+    Suite(
+        "dynamo-models-critical",
+        tier="l1",
+        lanes=("full",),
+        paths=("models/",),
+        markers="critical",
+    ),
+    Suite(
+        "torch-compile-backend",
+        tier="l1",
+        lanes=("full",),
        paths=("backend/",),
    ),
    Suite(
-        "torch-compile-models-critical", tier="l1", lanes=("full",),
+        "torch-compile-models-critical",
+        tier="l1",
+        lanes=("full",),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="critical", ir="torch_compile",
-    ),
-    Suite(
-        "ts-models", tier="l1", lanes=("full",),
-        cwd="tests/py/ts", paths=("models/",), setup=("hub",), variants=("standard",),
+        markers="critical",
+        ir="torch_compile",
+    ),
+    Suite(
+        "ts-models",
+        tier="l1",
+        lanes=("full",),
+        cwd="tests/py/ts",
+        paths=("models/",),
+        setup=("hub",),
+        variants=("standard",),
    ),
]

# ── L2 — exhaustive / full + nightly ──────────────────────────────────────────
_L2: list[Suite] = [
    Suite(
-        "torch-compile-models", tier="l2", lanes=("full", "nightly"),
+        "torch-compile-models",
+        tier="l2",
+        lanes=("full", "nightly"),
        paths=("models/test_models.py", "models/test_dyn_models.py"),
-        markers="not critical", ir="torch_compile", jobs="auto",
-    ),
-    Suite(
-        "dynamo-models", tier="l2", lanes=("full", "nightly"),
-        paths=("models/",), markers="not critical", jobs="auto",
-    ),
-    Suite(
-        "dynamo-llm", tier="l2", lanes=("nightly",),
-        paths=("llm/",), jobs="auto",
-    ),
-    Suite(
-        "dynamo-runtime-full", tier="l2", lanes=("full", "nightly"),
-        paths=("runtime/",), keyword="not test_000_ and not test_001_", jobs="auto",
-    ),
-    Suite(
-        "executorch", tier="l2", lanes=("nightly",),
-        paths=("executorch/",), setup=("executorch",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
+        markers="not critical",
+        ir="torch_compile",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-models",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("models/",),
+        markers="not critical",
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-llm",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("llm/",),
+        jobs="auto",
+    ),
+    Suite(
+        "dynamo-runtime-full",
+        tier="l2",
+        lanes=("full", "nightly"),
+        paths=("runtime/",),
+        keyword="not test_000_ and not test_001_",
+        jobs="auto",
+    ),
+    Suite(
+        "executorch",
+        tier="l2",
+        lanes=("nightly",),
+        paths=("executorch/",),
+        setup=("executorch",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
    ),
    Suite(
        # Standard: the automatic-plugin trio. RTX: the whole automatic_plugin dir.
        # (The redundant conversion/ re-run from the old l2_plugin is dropped.)
-        "plugins-automatic", tier="l2", lanes=("nightly",), jobs="auto",
+        "plugins-automatic",
+        tier="l2",
+        lanes=("nightly",),
+        jobs="auto",
        paths=(
            "automatic_plugin/test_automatic_plugin.py",
            "automatic_plugin/test_automatic_plugin_with_attrs.py",
            "automatic_plugin/test_flashinfer_rmsnorm.py",
        ),
        overrides={"rtx": {"paths": ("automatic_plugin/",)}},
    ),
    Suite(
-        "kernels", tier="l2", lanes=("nightly",),
-        cwd="tests/py/kernels", paths=(".",), setup=("cuda-core",), jobs="auto",
-        variants=("standard",), platforms=("linux-x86_64",),
-    ),
-    Suite(
-        "ts-integrations", tier="l2", lanes=("nightly",),
-        cwd="tests/py/ts", paths=("integrations/",), setup=("hub",), jobs="auto",
-        variants=("standard",),
-    ),
-    Suite(
-        "distributed", tier="l2", lanes=("nightly",),
+        "kernels",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/kernels",
+        paths=(".",),
+        setup=("cuda-core",),
+        jobs="auto",
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+    ),
+    Suite(
+        "ts-integrations",
+        tier="l2",
+        lanes=("nightly",),
+        cwd="tests/py/ts",
+        paths=("integrations/",),
+        setup=("hub",),
+        jobs="auto",
+        variants=("standard",),
+    ),
+    Suite(
+        "distributed",
+        tier="l2",
+        lanes=("nightly",),
        paths=(
            "distributed/test_nccl_ops.py",
            "distributed/test_native_nccl.py",
            "distributed/test_export_save_load.py",
        ),
-        jobs="auto", verbose=True, reruns=False, variants=("standard",),
-        platforms=("linux-x86_64",), setup=("mpi",),
+        jobs="auto",
+        verbose=True,
+        reruns=False,
+        variants=("standard",),
+        platforms=("linux-x86_64",),
+        setup=("mpi",),
        env={"USE_HOST_DEPS": "1", "CI_BUILD": "1", "USE_TRTLLM_PLUGINS": "1"},
        follow=(
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_native_nccl.py", "--multirank"),
-            ("-m", "torch_tensorrt.distributed.run", "--nproc_per_node=2",
-             "distributed/test_export_save_load.py", "--multirank"),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_native_nccl.py",
+                "--multirank",
+            ),
+            (
+                "-m",
+                "torch_tensorrt.distributed.run",
+                "--nproc_per_node=2",
+                "distributed/test_export_save_load.py",
+                "--multirank",
+            ),
        ),
    ),
]

# ── python-only — validates the PYTHON_ONLY=1 wheel against the runtime suite ──
_PYTHON_ONLY: list[Suite] = [
    Suite(
-        "python-only-runtime", tier="l1", lanes=("python-only",),
-        paths=("runtime/",), jobs="8", variants=("standard",),
+        "python-only-runtime",
+        tier="l1",
+        lanes=("python-only",),
+        paths=("runtime/",),
+        jobs="8",
+        variants=("standard",),
    ),
]

SUITES: tuple[Suite, ...] = tuple(_L0 + _L1 + _L2 + _PYTHON_ONLY)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant