diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 94940e7c27..48ab018910 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -2,6 +2,12 @@ [1.0.4] - 2026-XX-XX -------------------- +**Features** + +- CLI commands that load a tree sequence now read from stdin when the input + path argument is omitted. (:user:`chris-a-talbot`, :user:`jeromekelleher`, + :issue:`3468`, :pr:`3469`) + -------------------- [1.0.3] - 2026-05-14 -------------------- diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 66be01e6d2..63b326c140 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -26,6 +26,7 @@ import io import os +import subprocess import sys import tempfile import unittest @@ -310,6 +311,29 @@ def test_vcf_allow_position_zero(self, flags, expected): assert args.tree_sequence == tree_sequence assert args.allow_position_zero == expected + @pytest.mark.parametrize( + "cmd", + [ + "info", + "trees", + "vcf", + "nodes", + "edges", + "sites", + "mutations", + "migrations", + "individuals", + "populations", + "provenances", + ], + ) + def test_tree_sequence_argument_optional(self, cmd): + # Omitting the positional argument selects stdin (tree_sequence is None); + # providing a path stores the path string. + parser = cli.get_tskit_parser() + assert parser.parse_args([cmd]).tree_sequence is None + assert parser.parse_args([cmd, "test.trees"]).tree_sequence == "test.trees" + def test_info_default_values(self): parser = cli.get_tskit_parser() cmd = "info" @@ -560,6 +584,16 @@ def test_vcf(self): assert len(stderr) == 0 self.verify_vcf(stdout) + def test_vcf_stdin(self): + # Omitting the path argument reads the tree sequence from stdin. The + # low-level loader requires a real file descriptor, so sys.stdin must be + # patched with an actual open binary file (not e.g. an io.BytesIO). + with open(self._tree_sequence_file, "rb") as f: + with mock.patch("sys.stdin", f): + stdout, stderr = capture_output(cli.tskit_main, ["vcf", "-0"]) + assert len(stderr) == 0 + self.verify_vcf(stdout) + def verify_info(self, ts, output_info): assert str(ts) == output_info @@ -642,3 +676,134 @@ def test_migrations(self): def test_provenances(self): self.verify("provenances") + + +@pytest.fixture(scope="module") +def treeseq_file(tmp_path_factory): + """ + A tree sequence dumped to file, containing migrations, mutations and + individuals so that every loading subcommand has something to output. + """ + ts = msprime.simulate( + length=1, + recombination_rate=2, + mutation_rate=2, + random_seed=1, + migration_matrix=[[0, 1], [1, 0]], + population_configurations=[msprime.PopulationConfiguration(5) for _ in range(2)], + record_migrations=True, + ) + assert ts.num_migrations > 0 + ts = tsutil.insert_random_ploidy_individuals(ts, samples_only=True) + path = tmp_path_factory.mktemp("tsk_cli_stdin") / "stdin.trees" + ts.dump(path) + return str(path) + + +# The loading subcommands and any extra flags they need to produce output. +STDIN_SUBCOMMANDS = [ + ["info"], + ["trees"], + ["vcf", "-0"], + ["nodes"], + ["edges"], + ["sites"], + ["mutations"], + ["migrations"], + ["individuals"], + ["populations"], + ["provenances"], +] + + +class TestStdin: + """ + Tests that reading from stdin (omitting the path argument) produces the same + output as loading from a file, for every loading subcommand. + """ + + @pytest.mark.parametrize("subcommand", STDIN_SUBCOMMANDS) + def test_stdin_matches_file(self, treeseq_file, subcommand): + file_stdout, file_stderr = capture_output( + cli.tskit_main, [*subcommand, treeseq_file] + ) + with open(treeseq_file, "rb") as f: + with mock.patch("sys.stdin", f): + stdin_stdout, stdin_stderr = capture_output(cli.tskit_main, subcommand) + assert file_stderr == "" + assert stdin_stderr == "" + assert len(file_stdout) > 0 + assert stdin_stdout == file_stdout + + +class TestStdinErrors: + """ + Tests that errors loading from stdin are reported cleanly. + """ + + def run_info_stdin(self, path): + with mock.patch("sys.exit", side_effect=TestException) as mocked_exit: + with open(path, "rb") as f: + with mock.patch("sys.stdin", f): + with pytest.raises(TestException): + capture_output(cli.tskit_main, ["info"]) + return mocked_exit.call_args[0][0] + + def test_empty_stdin(self, tmp_path): + path = tmp_path / "empty.trees" + path.write_bytes(b"") + assert self.run_info_stdin(path) == "Load error: End of file" + + def test_garbage_stdin(self, tmp_path): + path = tmp_path / "garbage.trees" + path.write_bytes(b"not a tree sequence at all") + message = self.run_info_stdin(path) + assert message.startswith("Load error: File not in kastore format") + + def test_truncated_stdin(self, tmp_path, treeseq_file): + path = tmp_path / "truncated.trees" + with open(treeseq_file, "rb") as f: + path.write_bytes(f.read(100)) + message = self.run_info_stdin(path) + assert message.startswith("Load error: File not in kastore format") + + +class TestStdinSubprocess: + """ + End-to-end tests that feed the tree sequence through a real OS pipe. Unlike + the in-process tests (which mock sys.stdin with a seekable file), these + exercise the genuine non-seekable stdin path. + """ + + def run_cli(self, args, input_bytes): + # Force UTF-8 stdout in the child so that commands printing unicode (e.g. + # the box-drawing characters from "info") don't fail when stdout is a + # pipe on platforms that default to a non-UTF-8 codec (e.g. Windows). + env = {**os.environ, "PYTHONIOENCODING": "utf-8"} + return subprocess.run( + [sys.executable, "-m", "tskit", *args], + input=input_bytes, + capture_output=True, + env=env, + ) + + @pytest.mark.parametrize("subcommand", [["info"], ["vcf", "-0"], ["nodes"]]) + def test_stdin_pipe_matches_file(self, treeseq_file, subcommand): + with open(treeseq_file, "rb") as f: + ts_bytes = f.read() + stdin_result = self.run_cli(subcommand, ts_bytes) + file_result = self.run_cli([*subcommand, treeseq_file], b"") + assert stdin_result.returncode == 0 + assert stdin_result.stderr == b"" + assert len(stdin_result.stdout) > 0 + assert stdin_result.stdout == file_result.stdout + + def test_empty_pipe(self): + result = self.run_cli(["info"], b"") + assert result.returncode != 0 + assert b"End of file" in result.stderr + + def test_garbage_pipe(self): + result = self.run_cli(["info"], b"not a tree sequence") + assert result.returncode != 0 + assert b"not in kastore format" in result.stderr diff --git a/python/tskit/cli.py b/python/tskit/cli.py index 50e97d6784..b8af1eab9a 100644 --- a/python/tskit/cli.py +++ b/python/tskit/cli.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2018-2025 Tskit Developers +# Copyright (c) 2018-2026 Tskit Developers # Copyright (c) 2015-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -45,9 +45,11 @@ def sys_exit(message): def load_tree_sequence(path): + if path is None: + path = sys.stdin try: return tskit.load(path) - except OSError as e: + except (OSError, EOFError, tskit.FileFormatError) as e: sys_exit(f"Load error: {e}") @@ -134,7 +136,12 @@ def run_vcf(args): def add_tree_sequence_argument(parser): - parser.add_argument("tree_sequence", help="The tskit tree sequence file") + parser.add_argument( + "tree_sequence", + help="The tskit tree sequence file. If not provided, read from stdin.", + default=None, + nargs="?", + ) def add_precision_argument(parser):