From d023fdd32d1c72405a7b3d08cf38d959773f0320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Wr=C3=B3bel?= Date: Fri, 3 Jul 2026 12:27:31 +0200 Subject: [PATCH] Enhance integration tests readiness validation by introducing a new regex for the "Integration Readiness" field. Update logic to prioritize declared readiness over rationale explanations in both contract and precheck services. Add corresponding unit tests to ensure correct behavior in edge cases. --- backend/app/services/contract_validator.py | 71 +++++++++++++-- .../test/services/test_contract_validator.py | 91 +++++++++++++++++++ .../services/run_generation_precheck.py | 49 +++++++++- .../tests/test_run_generation_precheck.py | 60 ++++++++++++ 4 files changed, 262 insertions(+), 9 deletions(-) diff --git a/backend/app/services/contract_validator.py b/backend/app/services/contract_validator.py index 5ef14c9..320dc18 100644 --- a/backend/app/services/contract_validator.py +++ b/backend/app/services/contract_validator.py @@ -122,9 +122,26 @@ def canonical_path(self) -> str: # Part F section header pattern — same regex as MCP precheck for behavior parity. _PART_F_HEADER = re.compile(r"^#+\s*part\s*f\b", re.IGNORECASE | re.MULTILINE) +# Any "Part " heading — used to bound Part F to its own section instead of +# scanning to end-of-document (a later Part, or a summary line restating Part F, +# must not be read as the readiness declaration). +_PART_HEADER = re.compile(r"^#+\s*part\s*[a-z]\b", re.IGNORECASE | re.MULTILINE) _INTEGRATION_READY_TOKEN = re.compile( r"integration[_\s-]*tests?[_\s-]*ready", re.IGNORECASE ) +# The authoritative "**Integration Readiness:** " declaration. Anchored to a +# single line (^...$, MULTILINE) so a prose sentence that merely mentions the label +# — before or after the real field — can never be captured instead of it. +_READINESS_FIELD = re.compile( + r"^\s*\**\s*integration\s+readiness[\s*:]{1,12}" + r"(?:(?Pintegration[_\s-]*tests?[_\s-]*ready)" + r"|(?Plocal[_\s-]*only|not[_\s-]*ready))" + r"[\s*.]{0,6}$", + re.IGNORECASE | re.MULTILINE, +) +# Detects an attempted (but unparseable) declaration — present so callers can refuse +# rather than guess, instead of silently falling back to a whole-section token scan. +_READINESS_LABEL = re.compile(r"integration\s+readiness", re.IGNORECASE) # Heading match is intentionally lenient (any heading level h1–h6) so the # deterministic preflight never rejects a plan the downstream LLM conversion agent # would accept. The skill emits `## Phase N:`, but a hand-edited `# Phase N` must @@ -286,6 +303,18 @@ def normalize_contract_files( ), ) + part_f = _part_f_section(analysis_text) + if part_f is not None and _readiness_field_is_ambiguous(part_f): + raise ContractRejection( + code=RejectionCode.ANALYSIS_UNREADABLE, + message=( + f"Couldn't read integration readiness from `{SPEC_COMPLETENESS_FILE}`. " + f"Part F declares an Integration Readiness field, but its value isn't " + f"`INTEGRATION_TESTS_READY` or `LOCAL_ONLY`. Re-run " + f"`check_specification_completeness`." + ), + ) + result: dict[str, Path] = {"analysis": analysis_path, "plan": plan_path} if is_integration_tests_ready(analysis_text): @@ -395,28 +424,54 @@ def validate_generation_contract_preflight( def _part_f_section(analysis_text: str) -> str | None: - """Return the text from the Part F header to end of document, or None if absent. + """Return the text of the Part F section only, or None if absent. - Readiness must be read from Part F only — the integration-readiness token may - legitimately appear elsewhere (a Part B summary table, a preamble note) without - meaning the spec is integration-ready. + Bounded at the next "Part " heading (or end of document if there is + none) so a later section — or a summary line elsewhere that merely restates + "Part F (Integration Readiness): ..." — is never scanned as part of Part F. """ match = _PART_F_HEADER.search(analysis_text) if match is None: return None - return analysis_text[match.start():] + next_part = _PART_HEADER.search(analysis_text, match.end()) + end = next_part.start() if next_part is not None else len(analysis_text) + return analysis_text[match.start():end] + + +def _readiness_field_is_ambiguous(part_f: str) -> bool: + """True if Part F attempts an "Integration Readiness" declaration that doesn't + parse to a recognized token — e.g. a paraphrase or a value split across + decoration the field regex doesn't recognize. + + Distinguishes "field missing entirely" (legitimate non-standard formatting, + left to the lenient fallback in ``is_integration_tests_ready``) from "field + present but unreadable" (refuse rather than guess, per callers below). + """ + return _READINESS_LABEL.search(part_f) is not None and _READINESS_FIELD.search(part_f) is None def is_integration_tests_ready(analysis_text: str) -> bool: """Return True if the analysis file declares INTEGRATION_TESTS_READY in Part F. - Only the Part F section is scanned. A document with no Part F header is treated - as not-ready here; genuine "Part F missing" cases are rejected earlier in - ``normalize_contract_files`` with a clearer ANALYSIS_UNREADABLE error. + Only the Part F section is scanned. Within it, the "**Integration Readiness:**" + field — anchored to its own line — is checked first, so neither a preamble + sentence nor a Rationale sentence that merely mentions the other token can be + captured in its place. If no such field is found at all (non-standard + formatting with no labeled declaration), falls back to scanning the whole + Part F section for the token. If a field IS attempted but doesn't parse, the + caller is expected to have already rejected via ``_readiness_field_is_ambiguous`` + rather than reach this fallback. + + A document with no Part F header is treated as not-ready here; genuine "Part F + missing" cases are rejected earlier in ``normalize_contract_files`` with a + clearer ANALYSIS_UNREADABLE error. """ part_f = _part_f_section(analysis_text) if part_f is None: return False + field_match = _READINESS_FIELD.search(part_f) + if field_match is not None: + return field_match.group("ready") is not None return bool(_INTEGRATION_READY_TOKEN.search(part_f)) diff --git a/backend/test/services/test_contract_validator.py b/backend/test/services/test_contract_validator.py index 9b57f09..88f5569 100644 --- a/backend/test/services/test_contract_validator.py +++ b/backend/test/services/test_contract_validator.py @@ -11,6 +11,7 @@ _find_candidates, _CanonicalFile, _normalize_for_match, + _readiness_field_is_ambiguous, ) from app.core.artifact_files import ( SPEC_COMPLETENESS_FILE, @@ -205,6 +206,22 @@ def test_rejection_message_contains_mcp_tool_name(self, tmp_path): normalize_contract_files(tmp_path, "docs", _logger()) assert "check_specification_completeness" in exc_info.value.message + def test_analysis_unreadable_when_readiness_field_ambiguous(self, tmp_path): + """A Part F that attempts a declaration but doesn't parse must be refused, + not silently guessed via the whole-section fallback.""" + _setup_outputs(tmp_path) + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** Currently LOCAL_ONLY due to missing CI\n\n" + "**Rationale:** Meets neither criteria for INTEGRATION_TESTS_READY.\n" + ) + _write(tmp_path, "docs", ANALYSIS_SUBDIR, SPEC_COMPLETENESS_FILE, analysis) + _write(tmp_path, "docs", PLANNING_SUBDIR, IMPLEMENTATION_PLAN_FILE, _MINIMAL_PLAN) + with pytest.raises(ContractRejection) as exc_info: + normalize_contract_files(tmp_path, "docs", _logger()) + assert exc_info.value.code == RejectionCode.ANALYSIS_UNREADABLE + assert "check_specification_completeness" in exc_info.value.message + class TestGenerationContractPreflight: def test_rejects_plan_with_no_phase_headings(self, tmp_path): @@ -317,3 +334,77 @@ def test_no_part_f_returns_false(self): # No Part F header at all → not ready here; the missing-Part-F case is # rejected separately by normalize_contract_files with ANALYSIS_UNREADABLE. assert is_integration_tests_ready("# Part A\n\nINTEGRATION_TESTS_READY\n") is False + + def test_token_in_part_f_rationale_does_not_override_declared_field(self): + # Real-world regression: the field declares LOCAL_ONLY, but the Rationale + # prose explains the gap by naming the other token. Must not flip to ready. + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** **LOCAL_ONLY**\n\n" + "**Rationale:** The deployment target is named, but there are no deploy " + "workflow files or IaC, and no acceptance/e2e test methodology. This meets " + "neither the deployment-methodology nor the acceptance-test criteria for " + "`INTEGRATION_TESTS_READY`.\n" + ) + assert is_integration_tests_ready(analysis) is False + + def test_declared_field_integration_ready_wins_over_rationale_wording(self): + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** **INTEGRATION_TESTS_READY**\n\n" + "**Rationale:** All deploy workflows, IaC, and e2e methodology are present " + "and confirmed working, well beyond LOCAL_ONLY.\n" + ) + assert is_integration_tests_ready(analysis) is True + + def test_preamble_mention_does_not_hijack_the_declared_field(self): + # Regression: an earlier sentence naming "integration readiness" (before the + # real field) must not be captured in place of the actual declaration. + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "Initial integration readiness: LOCAL_ONLY was assumed, but after " + "reviewing the CI workflows we upgraded the classification.\n\n" + "**Integration Readiness:** INTEGRATION_TESTS_READY\n\n" + "**Rationale:** All deploy workflows and e2e methodology confirmed working.\n" + ) + assert is_integration_tests_ready(analysis) is True + + def test_summary_line_after_part_f_does_not_hijack_the_declared_field(self): + # Regression: a later "- Part F (Integration Readiness): ..." summary line + # (emitted by the skill after Part F) must not be read as the declaration. + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** **LOCAL_ONLY**\n\n" + "**Rationale:** No CI configured.\n\n" + "## DIMENSION STATUS\n" + "- Part F (Integration Readiness): INTEGRATION_TESTS_READY\n" + ) + assert is_integration_tests_ready(analysis) is False + + +class TestReadinessFieldIsAmbiguous: + """Malformed-but-attempted declarations must be flagged, not silently guessed.""" + + def test_paraphrased_value_is_ambiguous(self): + part_f = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** Not ready yet\n\n" + "**Rationale:** Does not currently meet the bar for INTEGRATION_TESTS_READY.\n" + ) + assert _readiness_field_is_ambiguous(part_f) is True + + def test_qualifier_word_before_token_is_ambiguous(self): + part_f = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** Currently LOCAL_ONLY due to missing CI\n\n" + "**Rationale:** Meets neither criteria for INTEGRATION_TESTS_READY.\n" + ) + assert _readiness_field_is_ambiguous(part_f) is True + + def test_no_label_at_all_is_not_ambiguous(self): + # Legitimate non-standard formatting (no labeled field at all) stays lenient. + assert _readiness_field_is_ambiguous("## Part F\n\nLOCAL_ONLY\n") is False + + def test_well_formed_field_is_not_ambiguous(self): + part_f = "## Part F\n\n**Integration Readiness:** **LOCAL_ONLY**\n" + assert _readiness_field_is_ambiguous(part_f) is False diff --git a/mcp_server/services/run_generation_precheck.py b/mcp_server/services/run_generation_precheck.py index 54d05e7..0cb2e90 100644 --- a/mcp_server/services/run_generation_precheck.py +++ b/mcp_server/services/run_generation_precheck.py @@ -53,10 +53,28 @@ class RejectionCode(StrEnum): # Part F section header pattern (case-insensitive, allows for variations in formatting). _PART_F_HEADER = re.compile(r"^#+\s*part\s*f\b", re.IGNORECASE | re.MULTILINE) +# Any "Part " heading — used to bound Part F to its own section instead of +# scanning to end-of-document (a later Part, or a summary line restating Part F, +# must not be read as the readiness declaration). +_PART_HEADER = re.compile(r"^#+\s*part\s*[a-z]\b", re.IGNORECASE | re.MULTILINE) _INTEGRATION_READY_TOKEN = re.compile( r"integration[_\s-]*tests?[_\s-]*ready", re.IGNORECASE ) +# The authoritative "**Integration Readiness:** " declaration. Anchored to a +# single line (^...$, MULTILINE) so a prose sentence that merely mentions the label +# — before or after the real field — can never be captured instead of it. +_READINESS_FIELD = re.compile( + r"^\s*\**\s*integration\s+readiness[\s*:]{1,12}" + r"(?:(?Pintegration[_\s-]*tests?[_\s-]*ready)" + r"|(?Plocal[_\s-]*only|not[_\s-]*ready))" + r"[\s*.]{0,6}$", + re.IGNORECASE | re.MULTILINE, +) +# Detects an attempted (but unparseable) declaration — present so callers can refuse +# rather than guess, instead of silently falling back to a whole-section token scan. +_READINESS_LABEL = re.compile(r"integration\s+readiness", re.IGNORECASE) + def _normalize_stem(filename: str) -> str: """Match backend contract_validator stem normalization (case + separators).""" @@ -95,16 +113,33 @@ def _find_required_md( def _part_f_section(analysis_text: str) -> str | None: + """Return the text of the Part F section only, or None if absent. + + Bounded at the next "Part " heading (or end of document if there is + none) so a later section — or a summary line elsewhere that merely restates + "Part F (Integration Readiness): ..." — is never scanned as part of Part F. + """ match = _PART_F_HEADER.search(analysis_text) if match is None: return None - return analysis_text[match.start() :] + next_part = _PART_HEADER.search(analysis_text, match.end()) + end = next_part.start() if next_part is not None else len(analysis_text) + return analysis_text[match.start():end] + + +def _readiness_field_is_ambiguous(part_f: str) -> bool: + """True if Part F attempts an "Integration Readiness" declaration that doesn't + parse to a recognized token — refuse rather than guess (see `precheck`).""" + return _READINESS_LABEL.search(part_f) is not None and _READINESS_FIELD.search(part_f) is None def _is_integration_tests_ready(analysis_text: str) -> bool: part_f = _part_f_section(analysis_text) if part_f is None: return False + field_match = _READINESS_FIELD.search(part_f) + if field_match is not None: + return field_match.group("ready") is not None return bool(_INTEGRATION_READY_TOKEN.search(part_f)) @@ -196,6 +231,18 @@ def precheck( ), ) + part_f = _part_f_section(analysis_text) + if part_f is not None and _readiness_field_is_ambiguous(part_f): + return Rejection( + code=RejectionCode.ANALYSIS_UNREADABLE, + message=( + f"Couldn't read integration readiness from `{SPEC_COMPLETENESS_FILE}`. " + f"Part F declares an Integration Readiness field, but its value isn't " + f"`INTEGRATION_TESTS_READY` or `LOCAL_ONLY`. Re-run " + f"`check_specification_completeness`." + ), + ) + if _is_integration_tests_ready(analysis_text): e2e_plan_file = _find_required_md(outputs_path, PLANNING_SUBDIR, E2E_TEST_PLAN_FILE) if e2e_plan_file is None: diff --git a/mcp_server/tests/test_run_generation_precheck.py b/mcp_server/tests/test_run_generation_precheck.py index fed7165..3f892f3 100644 --- a/mcp_server/tests/test_run_generation_precheck.py +++ b/mcp_server/tests/test_run_generation_precheck.py @@ -153,3 +153,63 @@ def test_integration_token_outside_part_f_does_not_require_e2e(self, tmp_path): _write_analysis(tmp_path, content=analysis) _write_plan(tmp_path) assert precheck(tmp_path, "specs", "docs") is None + + def test_integration_token_in_part_f_rationale_does_not_require_e2e(self, tmp_path): + """Real-world regression: Rationale names the other token while explaining + why it does NOT apply. The declared field (LOCAL_ONLY) must still win.""" + _write_spec(tmp_path) + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** **LOCAL_ONLY**\n\n" + "**Rationale:** No deploy workflow files or IaC, and no acceptance/e2e " + "test methodology. This meets neither criteria for `INTEGRATION_TESTS_READY`.\n" + ) + _write_analysis(tmp_path, content=analysis) + _write_plan(tmp_path) + assert precheck(tmp_path, "specs", "docs") is None + + def test_preamble_mention_does_not_hijack_the_declared_field(self, tmp_path): + """Regression: an earlier sentence naming 'integration readiness' (before the + real field) must not be captured in place of the actual declaration.""" + _write_spec(tmp_path) + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "Initial integration readiness: LOCAL_ONLY was assumed, but after " + "reviewing the CI workflows we upgraded the classification.\n\n" + "**Integration Readiness:** INTEGRATION_TESTS_READY\n\n" + "**Rationale:** All deploy workflows and e2e methodology confirmed working.\n" + ) + _write_analysis(tmp_path, content=analysis) + _write_plan(tmp_path) + _write_e2e_plan(tmp_path) + assert precheck(tmp_path, "specs", "docs") is None + + def test_summary_line_after_part_f_does_not_hijack_the_declared_field(self, tmp_path): + """Regression: a later '- Part F (Integration Readiness): ...' summary line + (emitted by the skill after Part F) must not be read as the declaration.""" + _write_spec(tmp_path) + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** **LOCAL_ONLY**\n\n" + "**Rationale:** No CI configured.\n\n" + "## DIMENSION STATUS\n" + "- Part F (Integration Readiness): INTEGRATION_TESTS_READY\n" + ) + _write_analysis(tmp_path, content=analysis) + _write_plan(tmp_path) + assert precheck(tmp_path, "specs", "docs") is None + + def test_analysis_unreadable_when_readiness_field_ambiguous(self, tmp_path): + """A Part F that attempts a declaration but doesn't parse must be refused, + not silently guessed via the whole-section fallback.""" + _write_spec(tmp_path) + analysis = ( + "## Part F: Integration & Deployment Readiness\n\n" + "**Integration Readiness:** Currently LOCAL_ONLY due to missing CI\n\n" + "**Rationale:** Meets neither criteria for INTEGRATION_TESTS_READY.\n" + ) + _write_analysis(tmp_path, content=analysis) + _write_plan(tmp_path) + result = precheck(tmp_path, "specs", "docs") + assert result is not None + assert result.code == RejectionCode.ANALYSIS_UNREADABLE