From 13ee2ab0dde96521f558878ad36bd498e64b28e8 Mon Sep 17 00:00:00 2001 From: chris-a-talbot Date: Fri, 19 Jun 2026 19:47:21 -0400 Subject: [PATCH 1/3] Add stdin support to the tskit CLI Update changelog to reflect stdin CLI feature --- python/CHANGELOG.rst | 5 +++++ python/tests/test_cli.py | 22 ++++++++++++++++++++++ python/tskit/cli.py | 14 +++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 94940e7c27..5a5dcca5d8 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -2,6 +2,11 @@ [1.0.4] - 2026-XX-XX -------------------- +**Features** + +- CLI commands that load a tree sequence now accept ``-`` as the input path to + read from stdin. (:issue:`3468`) + -------------------- [1.0.3] - 2026-05-14 -------------------- diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 66be01e6d2..56a4cdc7e3 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -73,6 +73,11 @@ def capture_output(func, *args, **kwargs): return stdout_output, stderr_output +class MockStdIn: + def __init__(self, buffer): + self.buffer = buffer + + class TestCli(unittest.TestCase): """ Superclass of tests for the CLI needing temp files. @@ -310,6 +315,16 @@ def test_vcf_allow_position_zero(self, flags, expected): assert args.tree_sequence == tree_sequence assert args.allow_position_zero == expected + def test_vcf_stdin_file(self): + parser = cli.get_tskit_parser() + args = parser.parse_args(["vcf", "-"]) + assert args.tree_sequence == "-" + + def test_vcf_requires_tree_sequence(self): + parser = cli.get_tskit_parser() + with pytest.raises(SystemExit): + parser.parse_args(["vcf"]) + def test_info_default_values(self): parser = cli.get_tskit_parser() cmd = "info" @@ -560,6 +575,13 @@ def test_vcf(self): assert len(stderr) == 0 self.verify_vcf(stdout) + def test_vcf_stdin(self): + with open(self._tree_sequence_file, "rb") as f: + with mock.patch("sys.stdin", MockStdIn(f)): + stdout, stderr = capture_output(cli.tskit_main, ["vcf", "-0", "-"]) + assert len(stderr) == 0 + self.verify_vcf(stdout) + def verify_info(self, ts, output_info): assert str(ts) == output_info diff --git a/python/tskit/cli.py b/python/tskit/cli.py index 50e97d6784..dc56ad4587 100644 --- a/python/tskit/cli.py +++ b/python/tskit/cli.py @@ -45,10 +45,15 @@ def sys_exit(message): def load_tree_sequence(path): + if path in [None, "-"]: + path = getattr(sys.stdin, "buffer", sys.stdin) try: return tskit.load(path) - except OSError as e: - sys_exit(f"Load error: {e}") + except (OSError, EOFError, tskit.FileFormatError) as e: + message = str(e) + if isinstance(e, EOFError) and len(message) == 0: + message = "End of file" + sys_exit(f"Load error: {message}") def run_info(args): @@ -134,7 +139,10 @@ def run_vcf(args): def add_tree_sequence_argument(parser): - parser.add_argument("tree_sequence", help="The tskit tree sequence file") + parser.add_argument( + "tree_sequence", + help="The tskit tree sequence file, or '-' for stdin", + ) def add_precision_argument(parser): From 7b37132659a9a8105e87ad54f1321d6d57959b7c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 23 Jun 2026 16:12:47 +0100 Subject: [PATCH 2/3] Simplify CLI stdin implementation Update the CLI tests to cover reading a tree sequence from stdin when the positional path argument is omitted, replacing the stale tests written for the earlier '-' convention. Adds in-process equivalence tests across all loading subcommands, stdin error-path tests, and end-to-end subprocess tests that exercise a real non-seekable pipe. Also removes a dead comment in cli.py and updates the changelog to describe the omit-argument behaviour. --- python/CHANGELOG.rst | 5 +- python/tests/test_cli.py | 168 +++++++++++++++++++++++++++++++++++---- python/tskit/cli.py | 15 ++-- 3 files changed, 163 insertions(+), 25 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 5a5dcca5d8..48ab018910 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -4,8 +4,9 @@ **Features** -- CLI commands that load a tree sequence now accept ``-`` as the input path to - read from stdin. (:issue:`3468`) +- CLI commands that load a tree sequence now read from stdin when the input + path argument is omitted. (:user:`chris-a-talbot`, :user:`jeromekelleher`, + :issue:`3468`, :pr:`3469`) -------------------- [1.0.3] - 2026-05-14 diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 56a4cdc7e3..f2ae4ec0cb 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -26,6 +26,7 @@ import io import os +import subprocess import sys import tempfile import unittest @@ -73,11 +74,6 @@ def capture_output(func, *args, **kwargs): return stdout_output, stderr_output -class MockStdIn: - def __init__(self, buffer): - self.buffer = buffer - - class TestCli(unittest.TestCase): """ Superclass of tests for the CLI needing temp files. @@ -315,15 +311,28 @@ def test_vcf_allow_position_zero(self, flags, expected): assert args.tree_sequence == tree_sequence assert args.allow_position_zero == expected - def test_vcf_stdin_file(self): - parser = cli.get_tskit_parser() - args = parser.parse_args(["vcf", "-"]) - assert args.tree_sequence == "-" - - def test_vcf_requires_tree_sequence(self): + @pytest.mark.parametrize( + "cmd", + [ + "info", + "trees", + "vcf", + "nodes", + "edges", + "sites", + "mutations", + "migrations", + "individuals", + "populations", + "provenances", + ], + ) + def test_tree_sequence_argument_optional(self, cmd): + # Omitting the positional argument selects stdin (tree_sequence is None); + # providing a path stores the path string. parser = cli.get_tskit_parser() - with pytest.raises(SystemExit): - parser.parse_args(["vcf"]) + assert parser.parse_args([cmd]).tree_sequence is None + assert parser.parse_args([cmd, "test.trees"]).tree_sequence == "test.trees" def test_info_default_values(self): parser = cli.get_tskit_parser() @@ -576,9 +585,12 @@ def test_vcf(self): self.verify_vcf(stdout) def test_vcf_stdin(self): + # Omitting the path argument reads the tree sequence from stdin. The + # low-level loader requires a real file descriptor, so sys.stdin must be + # patched with an actual open binary file (not e.g. an io.BytesIO). with open(self._tree_sequence_file, "rb") as f: - with mock.patch("sys.stdin", MockStdIn(f)): - stdout, stderr = capture_output(cli.tskit_main, ["vcf", "-0", "-"]) + with mock.patch("sys.stdin", f): + stdout, stderr = capture_output(cli.tskit_main, ["vcf", "-0"]) assert len(stderr) == 0 self.verify_vcf(stdout) @@ -664,3 +676,129 @@ def test_migrations(self): def test_provenances(self): self.verify("provenances") + + +@pytest.fixture(scope="module") +def treeseq_file(tmp_path_factory): + """ + A tree sequence dumped to file, containing migrations, mutations and + individuals so that every loading subcommand has something to output. + """ + ts = msprime.simulate( + length=1, + recombination_rate=2, + mutation_rate=2, + random_seed=1, + migration_matrix=[[0, 1], [1, 0]], + population_configurations=[msprime.PopulationConfiguration(5) for _ in range(2)], + record_migrations=True, + ) + assert ts.num_migrations > 0 + ts = tsutil.insert_random_ploidy_individuals(ts, samples_only=True) + path = tmp_path_factory.mktemp("tsk_cli_stdin") / "stdin.trees" + ts.dump(path) + return str(path) + + +# The loading subcommands and any extra flags they need to produce output. +STDIN_SUBCOMMANDS = [ + ["info"], + ["trees"], + ["vcf", "-0"], + ["nodes"], + ["edges"], + ["sites"], + ["mutations"], + ["migrations"], + ["individuals"], + ["populations"], + ["provenances"], +] + + +class TestStdin: + """ + Tests that reading from stdin (omitting the path argument) produces the same + output as loading from a file, for every loading subcommand. + """ + + @pytest.mark.parametrize("subcommand", STDIN_SUBCOMMANDS) + def test_stdin_matches_file(self, treeseq_file, subcommand): + file_stdout, file_stderr = capture_output( + cli.tskit_main, [*subcommand, treeseq_file] + ) + with open(treeseq_file, "rb") as f: + with mock.patch("sys.stdin", f): + stdin_stdout, stdin_stderr = capture_output(cli.tskit_main, subcommand) + assert file_stderr == "" + assert stdin_stderr == "" + assert len(file_stdout) > 0 + assert stdin_stdout == file_stdout + + +class TestStdinErrors: + """ + Tests that errors loading from stdin are reported cleanly. + """ + + def run_info_stdin(self, path): + with mock.patch("sys.exit", side_effect=TestException) as mocked_exit: + with open(path, "rb") as f: + with mock.patch("sys.stdin", f): + with pytest.raises(TestException): + capture_output(cli.tskit_main, ["info"]) + return mocked_exit.call_args[0][0] + + def test_empty_stdin(self, tmp_path): + path = tmp_path / "empty.trees" + path.write_bytes(b"") + assert self.run_info_stdin(path) == "Load error: End of file" + + def test_garbage_stdin(self, tmp_path): + path = tmp_path / "garbage.trees" + path.write_bytes(b"not a tree sequence at all") + message = self.run_info_stdin(path) + assert message.startswith("Load error: File not in kastore format") + + def test_truncated_stdin(self, tmp_path, treeseq_file): + path = tmp_path / "truncated.trees" + with open(treeseq_file, "rb") as f: + path.write_bytes(f.read(100)) + message = self.run_info_stdin(path) + assert message.startswith("Load error: File not in kastore format") + + +class TestStdinSubprocess: + """ + End-to-end tests that feed the tree sequence through a real OS pipe. Unlike + the in-process tests (which mock sys.stdin with a seekable file), these + exercise the genuine non-seekable stdin path. + """ + + def run_cli(self, args, input_bytes): + return subprocess.run( + [sys.executable, "-m", "tskit", *args], + input=input_bytes, + capture_output=True, + ) + + @pytest.mark.parametrize("subcommand", [["info"], ["vcf", "-0"], ["nodes"]]) + def test_stdin_pipe_matches_file(self, treeseq_file, subcommand): + with open(treeseq_file, "rb") as f: + ts_bytes = f.read() + stdin_result = self.run_cli(subcommand, ts_bytes) + file_result = self.run_cli([*subcommand, treeseq_file], b"") + assert stdin_result.returncode == 0 + assert stdin_result.stderr == b"" + assert len(stdin_result.stdout) > 0 + assert stdin_result.stdout == file_result.stdout + + def test_empty_pipe(self): + result = self.run_cli(["info"], b"") + assert result.returncode != 0 + assert b"End of file" in result.stderr + + def test_garbage_pipe(self): + result = self.run_cli(["info"], b"not a tree sequence") + assert result.returncode != 0 + assert b"not in kastore format" in result.stderr diff --git a/python/tskit/cli.py b/python/tskit/cli.py index dc56ad4587..b8af1eab9a 100644 --- a/python/tskit/cli.py +++ b/python/tskit/cli.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2018-2025 Tskit Developers +# Copyright (c) 2018-2026 Tskit Developers # Copyright (c) 2015-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -45,15 +45,12 @@ def sys_exit(message): def load_tree_sequence(path): - if path in [None, "-"]: - path = getattr(sys.stdin, "buffer", sys.stdin) + if path is None: + path = sys.stdin try: return tskit.load(path) except (OSError, EOFError, tskit.FileFormatError) as e: - message = str(e) - if isinstance(e, EOFError) and len(message) == 0: - message = "End of file" - sys_exit(f"Load error: {message}") + sys_exit(f"Load error: {e}") def run_info(args): @@ -141,7 +138,9 @@ def run_vcf(args): def add_tree_sequence_argument(parser): parser.add_argument( "tree_sequence", - help="The tskit tree sequence file, or '-' for stdin", + help="The tskit tree sequence file. If not provided, read from stdin.", + default=None, + nargs="?", ) From 780edc014a1e3a4b64d2b0bf31c264df445dbb24 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 25 Jun 2026 10:36:55 +0100 Subject: [PATCH 3/3] Force UTF-8 in CLI subprocess tests for Windows The info command prints unicode box-drawing characters, which crash the child process with a UnicodeEncodeError when stdout is a pipe on platforms defaulting to a non-UTF-8 codec (e.g. Windows). Set PYTHONIOENCODING=utf-8 in the subprocess environment. --- python/tests/test_cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index f2ae4ec0cb..63b326c140 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -776,10 +776,15 @@ class TestStdinSubprocess: """ def run_cli(self, args, input_bytes): + # Force UTF-8 stdout in the child so that commands printing unicode (e.g. + # the box-drawing characters from "info") don't fail when stdout is a + # pipe on platforms that default to a non-UTF-8 codec (e.g. Windows). + env = {**os.environ, "PYTHONIOENCODING": "utf-8"} return subprocess.run( [sys.executable, "-m", "tskit", *args], input=input_bytes, capture_output=True, + env=env, ) @pytest.mark.parametrize("subcommand", [["info"], ["vcf", "-0"], ["nodes"]])