diff --git a/CoderMind/README.md b/CoderMind/README.md index 390dbf4..8a3b6be 100644 --- a/CoderMind/README.md +++ b/CoderMind/README.md @@ -69,7 +69,7 @@ Reverse Direction: Code → RPG │ │ │ │ (full) │ │ (manual │ │ └──────────────────┘ └────┬─────┘ │ fallback)│ │ rpg.json └──────────┘ │ - dep_graph.json rpg.json / dep_graph.json │ + (includes dep_graph) rpg.json │ │ │ └──────────────────────────────────────────┘ ▲ diff --git a/CoderMind/pyproject.toml b/CoderMind/pyproject.toml index a46b0e9..2b3e7cf 100644 --- a/CoderMind/pyproject.toml +++ b/CoderMind/pyproject.toml @@ -14,6 +14,16 @@ dependencies = [ "pytest", "tree-sitter", "tree-sitter-json", + # Tree-sitter grammars for the lang_parser module (Go / TS / JS / C / C++ / + # Rust). Installed by default so every language works out of the box; each + # is lazy-loaded in lang_parser/tree_sitter_backend.py, so if one grammar + # fails to import on a given platform only that language degrades. + "tree-sitter-go>=0.23.4", + "tree-sitter-typescript>=0.23.2", + "tree-sitter-javascript>=0.23.1", + "tree-sitter-c>=0.24.2", + "tree-sitter-cpp>=0.23.4", + "tree-sitter-rust>=0.24.2", "networkx", "rank_bm25", "rapidfuzz", diff --git a/CoderMind/scripts/build_data_flow.py b/CoderMind/scripts/build_data_flow.py index 7d141c8..452c060 100644 --- a/CoderMind/scripts/build_data_flow.py +++ b/CoderMind/scripts/build_data_flow.py @@ -38,6 +38,7 @@ # Import centralized paths from common.paths import SKELETON_FILE, DATA_FLOW_FILE, REPO_RPG_FILE from common import get_project_background_context +from common.language_meta import extract_language_metadata, metadata_with_languages # ============================================================================ @@ -139,6 +140,7 @@ def build(self, skeleton: Dict[str, Any]) -> Dict[str, Any]: """ # Get repository info repo_name, repo_info = get_repo_info_from_files() + primary_language, _ = extract_language_metadata(skeleton) # Enrich repo_info with project background / technology context 
project_background = get_project_background_context() @@ -185,7 +187,8 @@ def build(self, skeleton: Dict[str, Any]) -> Dict[str, Any]: max_iterations=self.max_iterations, logger=self.logger, trajectory=self.trajectory, - step_id=self._current_step_id + step_id=self._current_step_id, + target_language=primary_language, ) result = agent.build_data_flow( @@ -198,6 +201,7 @@ def build(self, skeleton: Dict[str, Any]) -> Dict[str, Any]: # Add components to result result["components"] = functional_areas + result["meta"] = metadata_with_languages(skeleton) # Update trajectory if self.trajectory and self._current_step_id: diff --git a/CoderMind/scripts/build_skeleton.py b/CoderMind/scripts/build_skeleton.py index d9b8554..8d748f9 100644 --- a/CoderMind/scripts/build_skeleton.py +++ b/CoderMind/scripts/build_skeleton.py @@ -33,6 +33,7 @@ REPO_RPG_FILE, ) from common import print_unicode_table +from common.language_meta import extract_language_metadata, metadata_with_languages from pathlib import Path as PPath from rpg import NodeMetaData from skeleton.skeleton_prompts import extract_features_from_subtree @@ -74,6 +75,12 @@ def convert_node(node): output = { "repository_name": rpg.repo_name, "repository_purpose": rpg.repo_info, + "meta": metadata_with_languages({ + "meta": { + "primary_language": getattr(rpg.repo_node.meta, "language", None) + if rpg.repo_node and rpg.repo_node.meta else None + } + }), "root": convert_node(skeleton.root), "statistics": { "total_components": len([n for n in rpg.nodes.values() if n.level == 1]), @@ -100,6 +107,7 @@ def __init__(self, max_iterations: int = 10, trajectory: Trajectory = None): # Build state self.repo_name = "" + self.target_language = None self.repo_data = {} self.rpg = None self.skeleton = None @@ -121,6 +129,7 @@ def build(self, input_data: Dict[str, Any]) -> Dict[str, Any]: """Execute complete skeleton building workflow.""" self.repo_data = input_data self.repo_name = input_data.get("repository_name", "project") + 
self.target_language = extract_language_metadata(input_data)[0] components = input_data.get("components", []) if not components: @@ -242,7 +251,8 @@ def _step2_file_design(self) -> bool: rpg=self.rpg, max_iterations=self.max_iterations, trajectory=self.trajectory, - step_id=self._current_step_id + step_id=self._current_step_id, + target_language=self.target_language, ) # Run file design process @@ -281,6 +291,12 @@ def _build_result(self) -> Dict[str, Any]: """Build the final result dictionary in CoderMind format.""" # Convert to CoderMind compatible format result = convert_skeleton_to_cmind_format(self.skeleton, self.rpg) + result["meta"] = metadata_with_languages({ + "meta": { + "primary_language": self.target_language, + "target_languages": [self.target_language] if self.target_language else [], + } + }) # Add statistics result["statistics"].update({ @@ -553,7 +569,7 @@ def patch_missing(input_data: Dict[str, Any]) -> Dict[str, Any]: json.dump(result, f, indent=2, ensure_ascii=False) rpg.save_json(str(REPO_RPG_FILE), indent=2) - print(f"\n[OK] Patch complete:") + print("\n[OK] Patch complete:") print(f" - Missing features patched: {total_missing}") print(f" - New files created: {new_file_count}") print(f" - Features merged into existing files: {merged_count}") diff --git a/CoderMind/scripts/check_base_classes.py b/CoderMind/scripts/check_base_classes.py index 91308f8..aed69b6 100644 --- a/CoderMind/scripts/check_base_classes.py +++ b/CoderMind/scripts/check_base_classes.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """Check Base Classes Script. 
-Function: Validate base_classes.json state and validate Python syntax +Function: Validate base_classes.json state and target-language syntax - Checks if base_classes.json exists (init state) - Validates JSON structure (error state if invalid) -- Validates Python code syntax (error state if syntax errors) +- Validates source syntax (error state if syntax errors) - Returns update state if valid Input: .cmind/base_classes.json @@ -15,8 +15,9 @@ from pathlib import Path from typing import Dict, Any, List, Tuple -# Import from common utils -from common import validate_python_syntax, extract_class_names +from common.language_meta import extract_language_metadata +from decoder_lang import get_backend +from func_design.base_class_agent import extract_declaration_names # Import centralized paths from common.paths import BASE_CLASSES_FILE @@ -34,6 +35,7 @@ def load_json(file_path: Path) -> Dict[str, Any]: def validate_base_classes_structure(data: Dict[str, Any]) -> Tuple[bool, List[str]]: """Validate base classes structure.""" errors = [] + backend = get_backend(extract_language_metadata(data)[0]) base_classes = data.get("base_classes", []) @@ -53,16 +55,15 @@ def validate_base_classes_structure(data: Dict[str, Any]) -> Tuple[bool, List[st elif not bc[field]: errors.append(f"Base class {i}: field '{field}' is empty") - # Validate Python syntax code = bc.get("code", "") if code: - is_valid, error = validate_python_syntax(code) + is_valid, error = backend.syntax_check(code, bc.get("file_path", "")) if not is_valid: # Try to get name from bc or extract from code name = bc.get("name", "") if not name: - class_names = extract_class_names(code) - name = class_names[0] if class_names else "unknown" + declarations = extract_declaration_names(code, backend) + name = declarations[0] if declarations else "unknown" errors.append(f"Base class {i} ({name}): syntax error - {error}") # Also validate data_structures if present @@ -94,11 +95,13 @@ def validate_base_classes_structure(data: 
Dict[str, Any]) -> Tuple[bool, List[st code = ds.get("code", "") if code: - is_valid, error = validate_python_syntax(code) + is_valid, error = backend.syntax_check( + code, + ds.get("file_path", f"data_structure{backend.file_extension}"), + ) if not is_valid: - name = "" - class_names = extract_class_names(code) - name = class_names[0] if class_names else "unknown" + declarations = extract_declaration_names(code, backend) + name = declarations[0] if declarations else "unknown" errors.append(f"Data structure {i} ({name}): syntax error - {error}") return len(errors) == 0, errors @@ -172,6 +175,7 @@ def inspect_state(base_classes_path: Path) -> Dict[str, Any]: "data_structure_names": ds_class_names, "data_structure_subtrees": ds_subtrees, "data_structure_file_paths": ds_file_paths, + "language": extract_language_metadata(data)[0], } } diff --git a/CoderMind/scripts/check_code_gen.py b/CoderMind/scripts/check_code_gen.py index 16d3df6..f2165ec 100644 --- a/CoderMind/scripts/check_code_gen.py +++ b/CoderMind/scripts/check_code_gen.py @@ -362,16 +362,53 @@ def determine_state( # actually generating the expected files. missing_artifacts = [] repo_root = REPO_DIR - - # Check for main_entry task artifact + + # Resolve the target language so entry-point / dependency artifact + # checks are not hard-coded to Python's ``main.py`` / + # ``requirements.txt``. Routes through the canonical repo resolver so + # the language is inferred from the real on-disk sources when the rpg + # metadata is missing, rather than silently assuming Python. Falls + # back to Python on any failure so the check degrades to its previous + # behaviour rather than crashing. 
+ backend = None + try: + from common.paths import REPO_RPG_FILE + from decoder_lang import resolve_repo_backend + rpg_obj = None + if Path(REPO_RPG_FILE).is_file(): + rpg_obj = json.loads(Path(REPO_RPG_FILE).read_text(encoding="utf-8")) + backend = resolve_repo_backend(repo_root, rpg_obj=rpg_obj) + except Exception: # noqa: BLE001 — degraded mode: assume Python + backend = None + + # Check for main_entry task artifact (language-aware entry path). main_entry_ids = [tid for tid in completed_ids if tid.startswith("")] - if main_entry_ids and not (repo_root / "main.py").exists(): - missing_artifacts.append("main.py (from task)") - - # Check for requirements task artifact + if main_entry_ids: + if backend is not None: + # Accept any of the backend's entry-point shapes. A single + # canonical path is too strict when the skeleton placed the + # entry off-canonical (e.g. C++ ``src/cli/main.cpp``) or the + # language uses a glob convention (Go ``cmd/*/main.go``). + candidates = backend.entry_point_candidates() + entry_exists = any( + (any(repo_root.glob(c)) if "*" in c else (repo_root / c).exists()) + for c in candidates + ) + if candidates and not entry_exists: + missing_artifacts.append( + f"{candidates[0]} (from task)" + ) + elif not (repo_root / "main.py").exists(): + missing_artifacts.append("main.py (from task)") + + # Check for requirements task artifact. The dependency-manifest + # filename is language-specific; only Python's is asserted here + # (other languages manage deps via go.mod / Cargo.toml / package.json + # which the dependency task and build steps validate separately). 
req_ids = [tid for tid in completed_ids if tid.startswith("")] - if req_ids and not (repo_root / "requirements.txt").exists(): - missing_artifacts.append("requirements.txt (from task)") + if req_ids and (backend is None or backend.name == "python"): + if not (repo_root / "requirements.txt").exists(): + missing_artifacts.append("requirements.txt (from task)") if missing_artifacts: result["type"] = "incomplete" diff --git a/CoderMind/scripts/code_gen/batch_prompts.py b/CoderMind/scripts/code_gen/batch_prompts.py index 4a74fb2..a39b403 100644 --- a/CoderMind/scripts/code_gen/batch_prompts.py +++ b/CoderMind/scripts/code_gen/batch_prompts.py @@ -17,7 +17,9 @@ from __future__ import annotations +import json import logging +import shlex import shutil import sys from pathlib import Path @@ -27,6 +29,7 @@ from common.import_normalizer import build_import_convention_snippet from common.paths import ( CODE_GEN_STATE_FILE as STATE_FILE, + FEATURE_SPEC_FILE, REPO_RPG_FILE, TASKS_FILE, get_scripts_dir, @@ -43,6 +46,13 @@ get_dev_python, get_dev_venv_path, ) +from decoder_lang import ( + EnvHandle, + LanguageBackend, + ToolchainUnavailable, + get_backend, + resolve_decoder_language, +) logger = logging.getLogger(__name__) @@ -53,6 +63,15 @@ # generated prompt; not used to drive any Python-side loop). MAX_ITERATIONS = 5 +_FALLBACK_TEST_COMMANDS = { + "go": ["go", "test", "-v", "./..."], + "rust": ["cargo", "test"], + "typescript": ["npm", "test"], + "javascript": ["npm", "test"], + "c": ["make", "test"], + "cpp": ["ctest", "--output-on-failure"], +} + # ============================================================================ # Prompt Templates @@ -81,7 +100,7 @@ HTML with CSS classes, returns data structures), read those consuming modules to ensure compatibility. - Read existing test files in `tests/` to understand conventions. -- Read `requirements.txt` if it exists. 
+- {dependency_manifest_instruction} - **UI/View code quality:** If you are implementing code that generates HTML, renders pages, produces visual output, or defines styles/CSS: - Ensure all HTML pages use the shared layout (head, nav, footer) consistently @@ -132,14 +151,14 @@ clearly wrong based on the skeleton). ### Step 5: Analyze & Fix (if tests fail) -- Read the FULL pytest output carefully. +- Read the FULL {test_tool_name} output carefully. - Determine root cause: test bug, code bug, import error, or dependency issue. - Fix the appropriate file(s). You MAY fix: - Test files (wrong assertions, bad mocks, missing imports) - Source files (logic bugs, missing methods, wrong signatures) - - Other project files (broken imports, missing `__init__.py`) - - requirements.txt (missing third-party package) -- After fixing, re-run the EXACT SAME pytest command from Step 4. + - Other project files (broken imports, missing package markers) + - Dependency manifests (missing third-party package or module) +- After fixing, re-run the EXACT SAME test command from Step 4. ### Step 6: Repeat Steps 4–5 - Maximum **{max_iterations} iterations** of test → fix → test. @@ -176,78 +195,78 @@ runner can verify your claim: ``` -PYTEST_SUMMARY: +PYTEST_SUMMARY: BATCH_RESULT: PASS ``` or on failure: ``` -PYTEST_SUMMARY: +PYTEST_SUMMARY: BATCH_RESULT: FAIL | ``` -The `PYTEST_SUMMARY` line must be the *literal* one-line summary that -pytest printed, e.g. `5 passed in 0.42s`, `2 passed, 1 failed in 1.30s`, -`1 failed, 1 error in 0.55s`. Copy it verbatim from the run you just -performed; do NOT invent it. This lets the runner cross-check your -claim against an independent re-run. +The `PYTEST_SUMMARY` marker name is kept for runner compatibility. Its +value must be the *literal* one-line summary printed by the test command, +for example `5 passed in 0.42s`, `ok ./...`, or `test result: ok`. Copy it +verbatim from the run you just performed; do NOT invent it. 
This lets the +runner cross-check your claim against an independent re-run. ## ── Capabilities ───────────────────────────────────────── [OK] You CAN: - Read/write any file under `src/`, `tests/`, `static/`, `templates/`, and `examples/` - (Python, HTML, CSS, JavaScript, JSON, YAML, config files, etc.) + (source files in the target language, plus HTML, CSS, JSON, YAML, config files, etc.) - Create new directories and files if needed (e.g., `static/css/`, `templates/`) - Read any file in the repo for context - Run: `{pytest_cmd}` (this exact command only) -- Run: `{pip_install_cmd} install ` to install missing packages -- Update `requirements.txt` when adding new dependencies +{dependency_install_capability} - Fix import errors in ANY source file (not just the target) - Run: `git add -A && git commit -m ""` [FAIL] You MUST NOT: - Modify or read files under `.cmind/` - Run any `cmind script ...` or `cmind-mcp` commands -- Run arbitrary shell commands beyond pytest/pip/git listed above +- Run arbitrary shell commands beyond the test/dependency/git commands listed above - Install packages that are not genuinely needed by the source code - Delete files that are not part of your task -- Run pytest without `--timeout` flag (already included in the command) +{test_timeout_rule} -## ── Pytest Rules (CRITICAL) ────────────────────────────────── +## ── Test Command Rules (CRITICAL) ───────────────────────────── -1. **Always use the EXACT pytest command provided** — it has timeout flags - to prevent hanging tests. -2. **Do not manually run a different pytest command** — the provided command - already targets the correct test files for this batch. +1. **Always use the EXACT {test_tool_name} command provided**. +2. **Do not manually run a different test command** — the provided command + already targets the correct test scope for this batch. 3. If a test times out or hangs, the test is wrong. 
Fix the test: - - Remove infinite loops, blocking I/O, or `time.sleep()` calls + - Remove infinite loops, blocking I/O, or real-time sleeps/waits - Mock any external resources (network, filesystem, GPU) - Ensure all fixtures have finite setup/teardown 4. **Do not write tests that depend on timing** (real-time waits). - Use mocks or `unittest.mock.patch` for time-dependent behavior. + Mock time-dependent behavior with your target language's test/mocking + framework (see the Target Language section below). 5. **Do not write tests that spawn subprocesses or servers.** -6. **Output control:** Use `-x` (stop at first failure) and `--tb=short` - to keep output manageable. Focus on the FIRST failure. +6. **Output control:** prefer fail-fast and concise tracebacks so the + FIRST failure stays the focus; follow the exact test command provided. ## ── Test Quality Rules ─────────────────────────────────── -- Use `MagicMock(spec=RealClass)` or `create_autospec()`, never bare `MagicMock()`. -- For numeric/math operations: use real values (`np.array(...)`, `4.0`), not mocks. -- Mock at boundaries (I/O, external deps), not internal implementation. +- Use spec'd / auto-generated mocks bound to a real type, never an + unconstrained stand-in, and mock at boundaries (I/O, external deps), + not internal implementation. Use your target language's idiomatic + mocking facility. +- For numeric/math operations: use real values, not mocks. - Keep tests deterministic — no random data without fixed seeds. - Test count: proportional to task complexity. Small task = 3–8 tests. Do NOT over-engineer with 20+ tests for a simple class. ## ── Dependency Management ──────────────────────────────── -When you encounter `ModuleNotFoundError` or `ImportError` for a third-party package: -1. Install it: `{pip_install_cmd} install ` -2. Verify by re-running pytest. -3. Append the package to `requirements.txt` (create the file if it doesn't exist). 
+{dependency_management} {import_convention} +{language_context} + ## ── Project Context ────────────────────────────────────── {dependency_context} @@ -269,15 +288,15 @@ **Attempt:** {attempt_number} **Failure reason:** {failure_reason} {post_verify_section} -## Previous Test Output (last pytest run) +## Previous Test Output (last test-command run) ``` {last_test_output} ``` ## Instructions 1. Review what has already been written (read modified files). -2. Run the pytest command to see current status. -3. If tests fail → fix the **production code** first, then re-run pytest. +2. Run the exact test command to see current status. +3. If tests fail → fix the **production code** first, then re-run the exact test command. 4. **Do NOT silence failures by editing tests** — the tests in `tests/` describe the contract. Only modify a test if you can show it is logically wrong (wrong expected value, wrong fixture, etc.) and @@ -287,13 +306,14 @@ ## Exit Protocol (same as the original task) The final two lines of your response MUST be: ``` -PYTEST_SUMMARY: +PYTEST_SUMMARY: BATCH_RESULT: PASS # or FAIL | ``` -The `PYTEST_SUMMARY` must be copied verbatim from your pytest run. +The `PYTEST_SUMMARY` marker is kept for runner compatibility. Copy the +last summary line from your test-command run verbatim. All other rules from the original task apply (capabilities, constraints, -pytest rules, etc). The full original task is included below. +test-command rules, etc). The full original task is included below. 
""" TDD_PROJECT_FILE_PREAMBLE = """\ @@ -356,6 +376,170 @@ # Builder functions # ============================================================================ +def _load_json_if_exists(path: Path) -> Any: + """Load JSON from ``path`` or return None when unavailable.""" + if not path.exists(): + return None + try: + with open(path, "r", encoding="utf-8") as file: + return json.load(file) + except (OSError, json.JSONDecodeError): + return None + + +def _resolve_codegen_backend() -> LanguageBackend: + """Resolve the target language backend for code generation.""" + feature_spec = _load_json_if_exists(FEATURE_SPEC_FILE) + rpg_obj = _load_json_if_exists(REPO_RPG_FILE) + language = resolve_decoder_language(feature_spec=feature_spec, rpg_obj=rpg_obj) + return get_backend(language) + + +def _shell_join(argv: List[str]) -> str: + """Return a shell-safe command string for display in prompts.""" + return shlex.join([str(part) for part in argv]) + + +def _fallback_test_command(backend: LanguageBackend) -> List[str]: + """Return a stable test command when host tool detection is unavailable.""" + return list(_FALLBACK_TEST_COMMANDS.get(backend.name, [backend.prompt_hints().test_framework_name])) + + +def _build_backend_test_cmd( + backend: LanguageBackend, + repo_path: Path, + test_files: List[str], + venv_python: str, +) -> str: + """Build the exact test command the codegen agent should run.""" + if backend.name == "python": + return build_batch_pytest_cmd(test_files, venv_python) + + env = backend.detect_env(repo_path) or EnvHandle(project_root=repo_path.resolve()) + try: + return _shell_join(backend.test_command(env)) + except (ToolchainUnavailable, NotImplementedError, OSError): + return _shell_join(_fallback_test_command(backend)) + + +def _dependency_manifest_instruction(backend: LanguageBackend) -> str: + """Return the dependency manifest reading instruction for the backend.""" + manifest_by_language = { + "python": "Read `requirements.txt` if it exists.", + "go": "Read 
`go.mod` if it exists.", + "rust": "Read `Cargo.toml` if it exists.", + "typescript": "Read `package.json` and `tsconfig.json` if they exist.", + "javascript": "Read `package.json` if it exists.", + "c": "Read `Makefile` if it exists.", + "cpp": "Read `CMakeLists.txt` or `Makefile` if they exist.", + } + return manifest_by_language.get(backend.name, "Read the project's dependency manifest if it exists.") + + +def _dependency_install_capability(backend: LanguageBackend, repo_path: Path) -> str: + """Return the allowed dependency-install command bullet.""" + if backend.name == "python": + return f"- Run: `{_build_pip_install_cmd(repo_path)} install ` to install missing packages\n- Update `requirements.txt` when adding new dependencies" + capability_by_language = { + "go": "- Run: `go get ` only when a non-standard module is genuinely required\n- Update `go.mod` / `go.sum` when adding dependencies", + "rust": "- Run: `cargo add ` only when a crate is genuinely required\n- Update `Cargo.toml` / `Cargo.lock` when adding dependencies", + "typescript": "- Run: `npm install ` only when a package is genuinely required\n- Update `package.json` / lockfiles when adding dependencies", + "javascript": "- Run: `npm install ` only when a package is genuinely required\n- Update `package.json` / lockfiles when adding dependencies", + "c": "- Prefer the C standard library; do not install system packages from this workflow\n- Update `Makefile` when build flags or source lists change", + "cpp": "- Prefer the C++ standard library; do not install system packages from this workflow\n- Update `CMakeLists.txt` or `Makefile` when build flags or source lists change", + } + return capability_by_language.get( + backend.name, + "- Use the project's native dependency tool only when a dependency is genuinely required", + ) + + +def _dependency_management_text(backend: LanguageBackend, repo_path: Path) -> str: + """Return dependency-management instructions for the target language.""" + if 
backend.name == "python": + pip_cmd = _build_pip_install_cmd(repo_path) + return ( + "When you encounter `ModuleNotFoundError` or `ImportError` for a third-party package:\n" + f"1. Install it: `{pip_cmd} install `\n" + "2. Verify by re-running the exact test command.\n" + "3. Append the package to `requirements.txt` (create the file if it doesn't exist)." + ) + management_by_language = { + "go": ( + "When a non-standard Go module is genuinely needed:\n" + "1. Run `go get `.\n" + "2. Verify by re-running the exact test command.\n" + "3. Keep `go.mod` and `go.sum` consistent." + ), + "rust": ( + "When an external Rust crate is genuinely needed:\n" + "1. Run `cargo add `.\n" + "2. Verify by re-running the exact test command.\n" + "3. Keep `Cargo.toml` and `Cargo.lock` consistent." + ), + "typescript": ( + "When an npm package is genuinely needed:\n" + "1. Run `npm install `.\n" + "2. Verify by re-running the exact test command.\n" + "3. Keep `package.json` and lockfiles consistent." + ), + "javascript": ( + "When an npm package is genuinely needed:\n" + "1. Run `npm install `.\n" + "2. Verify by re-running the exact test command.\n" + "3. Keep `package.json` and lockfiles consistent." + ), + "c": "Prefer the C standard library. Do not add system dependencies unless the repository already documents them.", + "cpp": "Prefer the C++ standard library. 
Do not add system dependencies unless the repository already documents them.", + } + return management_by_language.get( + backend.name, + "Use the project's native dependency workflow and re-run the exact test command after changes.", + ) + + +def _test_timeout_rule(backend: LanguageBackend) -> str: + """Return a timeout-safety rule tailored to the test command.""" + if backend.name == "python": + return "- Run pytest without `--timeout` flag (already included in the command)" + return "- Run long-lived servers, watchers, or interactive commands instead of the exact test command" + + +def _build_language_context(backend: LanguageBackend, test_command: str) -> str: + """Build the target-language prompt section.""" + hints = backend.prompt_hints() + context = ( + "## ── Target Language ─────────────────────────────────────\n" + f"- Language: {hints.display_name}\n" + f"- Source extension: `{hints.file_extension}`\n" + f"- Code fences: ```{hints.markdown_fence}\n" + f"- Test command: `{test_command}`\n" + f"- Test framework/tool: {hints.test_framework_name}\n" + f"- Module naming: {hints.module_naming_rule}\n" + f"- Style: {hints.style_directive}\n" + ) + if backend.name != "python": + # The decoder's defaults are Python-centric; without an explicit + # prohibition the sub-agent tends to add Python helpers (a main.py + # launcher wrapper, a pytest conftest.py to drive native tests, a + # requirements.txt). Forbid them outright so the generated repo stays + # a pure single-language project. + context += ( + f"- **This is a {hints.display_name} project, NOT Python.** Every source and test " + f"file you create MUST use `{hints.file_extension}` (or the language's own test " + "suffix). 
Do NOT create ANY `.py` file.\n" + "- Specifically FORBIDDEN: `main.py` or any Python launcher/wrapper, `conftest.py`, " + "`pytest.ini`, `setup.py`, `pyproject.toml`, `requirements.txt`, `__init__.py`, or a " + "`.venv`/pip workflow.\n" + f"- Run tests ONLY with `{test_command}` ({hints.test_framework_name}). Do NOT wrap, " + "re-implement, or drive the test suite through pytest or any Python script.\n" + ) + else: + context += ( + "- Do NOT introduce Python-specific files, packages, or pytest conventions unless this is a Python project.\n" + ) + return context + def build_batch_pytest_cmd( test_files: List[str], venv_python: str, @@ -408,37 +592,55 @@ def _build_api_summary(repo_path: Path, source_files: List[str], max_chars: int Returns: Formatted string of file → class/function signatures. """ - import ast as _ast - + # Resolve the project's actual backend so signatures are extracted from + # the right language. Python keeps a precise AST rendering (bare argument + # names + return annotation); every other language uses the backend's own + # one-line ``format_signature`` so non-Python test-writing batches still + # receive real API context instead of nothing. + import ast as _ast # local import; only used for unparse(returns) + + backend = _resolve_codegen_backend() + is_python = backend.name == "python" summaries = [] for filepath in sorted(source_files): full_path = repo_path / filepath - if not full_path.exists() or full_path.suffix != '.py': + if not full_path.exists() or not backend.is_source_file(filepath): continue try: - tree = _ast.parse(full_path.read_text(encoding='utf-8')) - except (SyntaxError, UnicodeDecodeError): + source = full_path.read_text(encoding='utf-8') + except (OSError, UnicodeDecodeError): continue + units = backend.list_code_units(source, filepath) + # Walk top-level declarations only (parent is None) and, for + # classes, list direct public methods. The prompt format keeps + # bare argument names plus return annotations. 
+ top_level = [u for u in units if u.parent is None] file_sigs = [] - for node in tree.body: - if isinstance(node, _ast.ClassDef): - if node.name.startswith('_'): - continue + for unit in top_level: + if not unit.name or unit.name.startswith('_'): + continue + if unit.unit_type == 'class': methods = [ - n.name for n in node.body - if isinstance(n, (_ast.FunctionDef, _ast.AsyncFunctionDef)) - and not n.name.startswith('_') + u.name for u in units + if u.unit_type == 'method' and u.parent == unit.name + and not u.name.startswith('_') ] methods_str = ', '.join(methods) if methods else '(dataclass)' - file_sigs.append(f" class {node.name}: {methods_str}") - elif isinstance(node, (_ast.FunctionDef, _ast.AsyncFunctionDef)): - if node.name.startswith('_'): - continue - args = [a.arg for a in node.args.args if a.arg != 'self'] - ret = _ast.unparse(node.returns) if node.returns else '' - ret_str = f" -> {ret}" if ret else "" - file_sigs.append(f" def {node.name}({', '.join(args)}){ret_str}") + file_sigs.append(f" class {unit.name}: {methods_str}") + elif unit.unit_type == 'function': + if is_python: + node = (unit.extra or {}).get('ast_node') + if node is None: + continue + args = [a.arg for a in node.args.args if a.arg != 'self'] + ret = _ast.unparse(node.returns) if node.returns else '' + ret_str = f" -> {ret}" if ret else "" + file_sigs.append(f" def {unit.name}({', '.join(args)}){ret_str}") + else: + # Non-Python: use the backend's own signature renderer. + sig = backend.format_signature(unit) or unit.name + file_sigs.append(f" {sig}") if file_sigs: summaries.append(f"# {filepath}\n" + "\n".join(file_sigs)) @@ -555,8 +757,13 @@ def build_tdd_prompt( Returns: Complete prompt string ready for LLMClient.generate(). 
""" + backend = _resolve_codegen_backend() venv_python = get_dev_python(repo_path) or "python3" - import_convention = build_import_convention_snippet(repo_path=repo_path) + import_convention = ( + build_import_convention_snippet(repo_path=repo_path) + if backend.name == "python" + else "" + ) # --- Project docs: simplest path --- if is_project_docs_batch(task): @@ -579,8 +786,11 @@ def build_tdd_prompt( test_files = [] else: test_files = find_related_test_files(task.file_path, repo_path) - pytest_cmd = build_batch_pytest_cmd(test_files, venv_python) - pip_cmd = _build_pip_install_cmd(repo_path) + pytest_cmd = _build_backend_test_cmd(backend, repo_path, test_files, venv_python) + + # Language-aware entry point reference so testing-batch guidance never + # plants a Python file name (e.g. "main.py") in a non-Python project. + entry_point = backend.prompt_hints().entrypoint_example or "the main entry point" # For testing batches, allow fixing genuine integration bugs if task.task_type in ("integration_test", "final_test_docs"): @@ -594,11 +804,12 @@ def build_tdd_prompt( "- Data format mismatch at a module boundary\n\n" "Do NOT modify production code solely to make a poorly-written test pass.\n" "The test should reflect correct behavior; the code should implement it.\n" - "Do NOT create main.py — it will be created in a later task.\n\n" + f"Do NOT create the entry point ({entry_point}) — it will be created in a later task.\n\n" "**Testing strategy for efficiency:**\n" - "- After the first full pytest run, use `--last-failed` on subsequent runs " - "to only re-run failing tests. This saves time.\n" - "- Only run a full pytest at the very end to confirm everything passes.\n" + "- After the first full test-command run, use the native tool's " + "focused rerun option when available. 
This saves time.\n" + "- Only run the full provided test command at the very end to " + "confirm everything passes.\n" ) else: code_instructions = batch_state.code_prompt @@ -660,10 +871,15 @@ def build_tdd_prompt( test_instructions=batch_state.test_prompt, code_instructions=code_instructions, pytest_cmd=pytest_cmd, + test_tool_name=backend.prompt_hints().test_framework_name, max_iterations=MAX_ITERATIONS, batch_id=batch_state.batch_id, - pip_install_cmd=pip_cmd, + dependency_manifest_instruction=_dependency_manifest_instruction(backend), + dependency_install_capability=_dependency_install_capability(backend, repo_path), + dependency_management=_dependency_management_text(backend, repo_path), + test_timeout_rule=_test_timeout_rule(backend), import_convention=import_convention, + language_context=_build_language_context(backend, pytest_cmd), dependency_context=dep_ctx_str, file_path=task.file_path, units=", ".join(task.units_key), @@ -688,7 +904,7 @@ def build_resume_prompt( failure_reason: One-line reason from BATCH_RESULT: FAIL, or the post-verify mismatch reason if the sub-agent self-reported PASS but verification failed. - last_test_output: pytest output from post-verification. + last_test_output: Test-command output from post-verification. 
sub_agent_claimed_pass: True if the previous attempt reported ``BATCH_RESULT: PASS`` but post-verify rejected it; this triggers an extra warning section in the prompt so the @@ -714,11 +930,11 @@ def build_resume_prompt( post_verify_section = ( "\n\n## ⚠ False-positive PASS detected\n" "Your previous attempt ended with `BATCH_RESULT: PASS` and the\n" - f"PYTEST_SUMMARY line {agent_summary_repr}, but the runner's\n" - "independent pytest re-run reported the failure shown below.\n" + "PYTEST_SUMMARY line {agent_summary_repr}, but the runner's\n" + "independent test-command re-run reported the failure shown below.\n" "Possible causes you must investigate:\n" - "* You did not actually run pytest before declaring PASS.\n" - "* You ran pytest with `--no-cov` / `-k` / a different path that\n" + "* You did not actually run the exact test command before declaring PASS.\n" + "* You ran a different command or selector that\n" " excluded the failing tests.\n" "* You modified or deleted tests instead of fixing production code.\n" "* Your local changes were not committed before the runner verified.\n" diff --git a/CoderMind/scripts/code_gen/context_collector.py b/CoderMind/scripts/code_gen/context_collector.py index 5489323..bef7fa7 100644 --- a/CoderMind/scripts/code_gen/context_collector.py +++ b/CoderMind/scripts/code_gen/context_collector.py @@ -21,7 +21,9 @@ ensure_future_annotations, fix_missing_stdlib_imports, ) +from common.language_meta import extract_language_metadata from common.utils import get_project_background_context +from decoder_lang import get_backend if TYPE_CHECKING: from common.task_batch import PlannedTask @@ -69,12 +71,14 @@ def write_interface_skeletons( return result subtrees = interfaces.get("subtrees", {}) + primary_language, _ = extract_language_metadata(interfaces) + backend = get_backend(primary_language) - # Detect import prefix from file paths in interfaces.json. - # If file paths start with "src/", imports should use "src." prefix. 
- import_prefix = detect_project_import_prefix( - interfaces_subtrees=subtrees, - ) + import_prefix = "" + if backend.name == "python": + import_prefix = detect_project_import_prefix( + interfaces_subtrees=subtrees, + ) for _subtree_name, subtree_data in subtrees.items(): file_interfaces = subtree_data.get("interfaces", {}) @@ -83,15 +87,12 @@ def write_interface_skeletons( if not file_code or not file_code.strip(): continue - # Normalize import prefixes before writing - if import_prefix: + if backend.name == "python" and import_prefix: file_code = normalize_code(file_code, import_prefix) - # Add from __future__ import annotations to prevent forward ref errors - file_code = ensure_future_annotations(file_code) - - # Fix missing stdlib imports (dataclass, Callable, etc.) - file_code = fix_missing_stdlib_imports(file_code) + if backend.name == "python": + file_code = ensure_future_annotations(file_code) + file_code = fix_missing_stdlib_imports(file_code) full_path = repo_path / file_path if full_path.exists(): diff --git a/CoderMind/scripts/code_gen/final_validation.py b/CoderMind/scripts/code_gen/final_validation.py index fccb202..b3edf9e 100644 --- a/CoderMind/scripts/code_gen/final_validation.py +++ b/CoderMind/scripts/code_gen/final_validation.py @@ -26,13 +26,15 @@ from common.git_utils import GitRunner from common.paths import CODE_GEN_STATE_FILE as STATE_FILE, REPO_DIR +from code_gen.batch_prompts import _build_backend_test_cmd from code_gen.git_ops import ensure_on_main from code_gen.stage_io import save_stage_result from code_gen.sub_agent import dispatch_sub_agent from code_gen.test_runner import ( ensure_deps_installed, get_dev_python, - run_pytest, + resolve_test_backend, + run_project_tests, ) logger = logging.getLogger(__name__) @@ -47,12 +49,17 @@ def final_test( repo_path: Optional[Path] = None, state_path: Path = STATE_FILE, + max_repair_iters: int = 2, ) -> Dict[str, Any]: """Run the full test suite against the completed repo. 
Args: repo_path: Project repo path. state_path: Path to state file. + max_repair_iters: Bound on repair sub-agent passes when the full + suite fails. Cross-batch inconsistencies (e.g. a test asserting the + README documents a symbol another batch produced) only surface here, + where no per-batch TDD loop can catch them. Returns: Result dict with test statistics. @@ -67,22 +74,135 @@ def final_test( except RuntimeError as exc: return {"success": False, "error": str(exc)} - # Ensure all deps - try: - ensure_deps_installed(repo_path) - except Exception as exc: - logger.warning("Dependency install issue: %s", exc) + backend = resolve_test_backend(repo_path=repo_path) + if backend.name == "python": + try: + ensure_deps_installed(repo_path) + except Exception as exc: + logger.warning("Dependency install issue: %s", exc) # Run full test suite - result = run_pytest( + result = run_project_tests( repo_path, timeout=DEFAULT_PYTEST_OVERALL_TIMEOUT, extra_args=[ "-v", "--tb=short", f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread", ], + backend=backend, ) + # Guard against a no-op "pass": a verification gate that executed zero + # tests is not a pass, it is a non-result (e.g. ``go test ./...`` matching + # no packages, or the runner invoked before sources were in the tree). + # The backend already reports this as a non-success "errored" status; here + # we make the final gate fail loudly with a precise diagnostic instead of + # dispatching a code-repair agent that cannot fix a "no tests ran" state. + executed = result.passed + result.failed + result.errors + result.skipped + if not result.success and executed == 0: + # A toolchain/infra failure (missing tool, timeout, crash → + # return_code -1) is a different non-result than a command that ran + # cleanly (exit 0) yet collected zero tests. Neither is a pass and + # neither is fixable by a code-repair agent, but they need different + # diagnostics, so report them distinctly. 
+ toolchain_failure = result.return_code != 0 + if toolchain_failure: + next_action = ( + f"Final test could not run the {backend.display_name} test " + "command (toolchain unavailable, timeout, or crash). Install or " + "repair the language toolchain and re-run — this is an " + "environment problem, not a code defect." + ) + else: + next_action = ( + f"Final test ran the {backend.display_name} test command but " + "no tests executed (zero collected). This is a verification " + "no-op, not a pass: confirm the generated test suite is present " + "on the main branch and the test command discovers it." + ) + logger.error( + "Final test executed zero tests for %s backend (return_code=%s) — " + "treating as a verification failure, not a pass.", + backend.name, result.return_code, + ) + no_test_result = { + "success": False, + "type": "final_test", + "passed": 0, + "failed": 0, + "errors": 0, + "skipped": 0, + "duration": result.duration, + "output": result.output[:5000], + "no_tests_executed": not toolchain_failure, + "toolchain_unavailable": toolchain_failure, + "next_action": next_action, + } + save_stage_result("final_test", { + "success": False, + "passed": 0, + "failed": 0, + "errors": 0, + "no_tests_executed": not toolchain_failure, + "toolchain_unavailable": toolchain_failure, + "output_tail": "\n".join(result.output.splitlines()[-40:]), + }) + return no_test_result + + # Repair loop for full-suite failures. The per-batch TDD loop only sees one + # file's tests at a time, so cross-file consistency gaps (a test asserting + # the README / an example module documents a specific symbol or section that + # a different batch generated independently) survive to here. Dispatch a + # bounded repair pass that reconciles the repo against the EXISTING tests + # rather than letting one such gap fail the whole stage with no recovery. 
+ repair_attempts = 0 + while not result.success and repair_attempts < max_repair_iters: + repair_attempts += 1 + venv_python = get_dev_python(repo_path) or "python3" + repair_verify_cmd = _build_backend_test_cmd( + backend, repo_path, [], venv_python, + ) + failure_tail = "\n".join(result.output.splitlines()[-80:]) + repair_prompt = ( + "The full test suite failed after every batch completed. Reconcile " + "the repository so the EXISTING tests pass. These failures are " + "usually cross-file consistency gaps — for example a test asserts " + "that the README or an example module documents a specific symbol " + "or section, but a different batch generated those files " + "independently.\n\n" + f"Failing test output (tail):\n{failure_tail}\n\n" + "Rules:\n" + "- Fix production code, documentation, or example files so the " + "existing tests pass. Do NOT delete, skip, or weaken any test.\n" + "- Do NOT create new test files.\n\n" + f"Verify with:\n```\n{repair_verify_cmd}\n```\n\n" + "When the suite is green, commit:\n" + "```\ngit add -A && git commit -m " + '"fix: reconcile final test failures"\n```\n' + "Then output: BATCH_RESULT: PASS" + ) + logger.info( + "Final test failed; dispatching repair agent (attempt %d/%d)", + repair_attempts, max_repair_iters, + ) + response, error = dispatch_sub_agent( + repair_prompt, repo_path, timeout=1800, + purpose="final_test_repair", + ) + if not response: + logger.warning("Final-test repair agent failed: %s", error) + break + ensure_on_main(git) + result = run_project_tests( + repo_path, + timeout=DEFAULT_PYTEST_OVERALL_TIMEOUT, + extra_args=[ + "-v", "--tb=short", + f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread", + ], + backend=backend, + ) + result_dict = { "success": result.success, "type": "final_test", @@ -99,6 +219,9 @@ def final_test( f"Review the output above and fix remaining issues." 
), } + if repair_attempts: + result_dict["final_test_repair_attempts"] = repair_attempts + result_dict["final_test_repaired"] = result.success # After pytest passes, run smoke test and attempt repair if issues found if result.success: @@ -106,7 +229,6 @@ def final_test( # Lazy import: smoke_test pulls in the dep_graph stack, so only # load it on the success path where we actually need it. from smoke_test import run_smoke_test - from code_gen.batch_prompts import build_batch_pytest_cmd smoke_result = run_smoke_test() smoke_dict = smoke_result.to_dict() @@ -119,9 +241,11 @@ def final_test( findings_desc = "\n".join( f"- [{f.severity}] {f.message}" for f in actionable ) - # Build pytest command for the repair agent + # Build the language-appropriate verify command for the agent venv_python = get_dev_python(repo_path) or "python3" - repair_pytest_cmd = build_batch_pytest_cmd([], venv_python) + repair_verify_cmd = _build_backend_test_cmd( + backend, repo_path, [], venv_python, + ) repair_prompt = ( "The smoke test detected the following issues after all " "unit tests passed. Fix each issue in the production code, " @@ -134,7 +258,7 @@ def final_test( "- Startup crash → fix initialization code\n\n" "Do NOT create new test files. 
Only fix production code.\n" "After fixing, run this command to verify:\n" - f"```\n{repair_pytest_cmd}\n```\n\n" + f"```\n{repair_verify_cmd}\n```\n\n" "When done, commit your changes:\n" "```\ngit add -A && git commit -m " '"fix: repair smoke test findings"\n```\n' @@ -150,13 +274,14 @@ def final_test( ) if response: # Verify repair didn't break existing tests - recheck = run_pytest( + recheck = run_project_tests( repo_path, timeout=DEFAULT_PYTEST_OVERALL_TIMEOUT, extra_args=[ "-v", "--tb=short", f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread", ], + backend=backend, ) if not recheck.success: logger.warning( diff --git a/CoderMind/scripts/code_gen/git_ops.py b/CoderMind/scripts/code_gen/git_ops.py index 181556d..06491c9 100644 --- a/CoderMind/scripts/code_gen/git_ops.py +++ b/CoderMind/scripts/code_gen/git_ops.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Optional, Tuple -from common.git_utils import GitRunner +from common.git_utils import GitRunner, sanitize_branch_component logger = logging.getLogger(__name__) @@ -70,7 +70,7 @@ def setup_batch_branch( """ ensure_on_main(git) - safe_id = batch_id.replace("/", "_").replace("\\", "_")[:50] + safe_id = sanitize_branch_component(batch_id, max_len=50, fallback="batch") branch_name = f"batch/{safe_id}" if git.branch_exists(branch_name): diff --git a/CoderMind/scripts/code_gen/global_review.py b/CoderMind/scripts/code_gen/global_review.py index 4489cb7..77023e9 100644 --- a/CoderMind/scripts/code_gen/global_review.py +++ b/CoderMind/scripts/code_gen/global_review.py @@ -33,7 +33,7 @@ REPO_DIR, TOOLS_DIR, ) -from code_gen.batch_prompts import build_batch_pytest_cmd +from code_gen.batch_prompts import _build_backend_test_cmd from code_gen.stage_io import ( save_stage_result as _save_stage_result, load_stage_result as _load_stage_result, @@ -42,7 +42,8 @@ from code_gen.test_runner import ( ensure_deps_installed, get_dev_python, - run_pytest, + resolve_test_backend, + 
run_project_tests, ) logger = logging.getLogger(__name__) @@ -652,8 +653,8 @@ def _collect_children(children: list, depth: int = 1, max_depth: int = 3) -> Lis def _load_gui_script_reuse_context(repo_path: Path) -> str: """Load reusable GUI interaction scripts for review prompt context. - Scripts are stored under ``repo/.cmind/tmp/gui_test_scripts`` and are - intended to capture stable, previously-validated interaction flows. + Scripts are stored under ``repo/.cmind/tmp/gui_test_scripts`` and + capture stable interaction flows that the review prompt can reuse. """ scripts_dir = repo_path / ".cmind" / "tmp" / "gui_test_scripts" if not scripts_dir.is_dir(): @@ -714,7 +715,8 @@ def _build_review_prompt(repo_path: Path, previous_issues: str = "") -> str: file_list = "(file listing unavailable)" venv_python = get_dev_python(repo_path) or "python3" - pytest_cmd = build_batch_pytest_cmd([], venv_python) + backend = resolve_test_backend(repo_path=repo_path) + pytest_cmd = _build_backend_test_cmd(backend, repo_path, [], venv_python) gui_script_reuse_context = _load_gui_script_reuse_context(repo_path) # Load accumulated findings from all pipeline stages @@ -1115,6 +1117,7 @@ def global_review( } start_time = time.time() previous_issues = "" + backend = resolve_test_backend(repo_path=repo_path) for iteration in range(1, max_iterations + 1): logger.info("━━━ Global Review: iteration %d/%d ━━━", iteration, max_iterations) @@ -1127,15 +1130,17 @@ def global_review( except Exception: pass - # 1. Pre-check: run pytest to know current state - try: - ensure_deps_installed(repo_path) - except Exception: - pass - pre_pytest = run_pytest( + # 1. 
Pre-check: run the project's tests to know current state + if backend.name == "python": + try: + ensure_deps_installed(repo_path) + except Exception: + pass + pre_pytest = run_project_tests( repo_path, timeout=DEFAULT_PYTEST_OVERALL_TIMEOUT, extra_args=[f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread"], + backend=backend, ) # Update stage file so _build_review_prompt sees fresh state _save_stage_result("final_test", { @@ -1192,10 +1197,11 @@ def global_review( # 5. Post-verify (independent — don't trust sub-agent) _cleanup_background_processes(repo_path) - post_pytest = run_pytest( + post_pytest = run_project_tests( repo_path, timeout=DEFAULT_PYTEST_OVERALL_TIMEOUT, extra_args=["-v", "--tb=short", f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread"], + backend=backend, ) # Stub check diff --git a/CoderMind/scripts/code_gen/post_verify.py b/CoderMind/scripts/code_gen/post_verify.py index a960a31..0e448a2 100644 --- a/CoderMind/scripts/code_gen/post_verify.py +++ b/CoderMind/scripts/code_gen/post_verify.py @@ -30,7 +30,8 @@ from code_gen.test_runner import ( ensure_deps_installed, find_related_test_files, - run_pytest, + resolve_test_backend, + run_project_tests, ) logger = logging.getLogger(__name__) @@ -115,17 +116,19 @@ def _git_diff_test_files(prefix: str = "tests/") -> list: test_files if test_files else "all tests", ) - # Ensure deps are installed (sub-agent may have added new ones) - try: - ensure_deps_installed(repo_path) - except Exception as exc: - logger.warning("ensure_deps_installed failed: %s", exc) + backend = resolve_test_backend(valid_files=test_files or None, repo_path=repo_path) + if backend.name == "python": + try: + ensure_deps_installed(repo_path) + except Exception as exc: + logger.warning("ensure_deps_installed failed: %s", exc) - result = run_pytest( + result = run_project_tests( repo_path, test_files=test_files or None, timeout=timeout, extra_args=[f"--timeout={DEFAULT_TEST_TIMEOUT}", "--timeout-method=thread"], + 
backend=backend, ) # Build summary diff --git a/CoderMind/scripts/code_gen/prompts.py b/CoderMind/scripts/code_gen/prompts.py index 0194f19..9a2d603 100644 --- a/CoderMind/scripts/code_gen/prompts.py +++ b/CoderMind/scripts/code_gen/prompts.py @@ -18,7 +18,30 @@ # Ensure scripts dir is on path for common.paths import _sys.path.insert(0, str(_Path(__file__).resolve().parent.parent)) from common.paths import REPO_DIR as _REPO_DIR -import ast as _ast_mod + + +_FENCE_BY_SUFFIX = { + ".py": "python", + ".go": "go", + ".rs": "rust", + ".ts": "typescript", + ".tsx": "typescript", + ".js": "javascript", + ".jsx": "javascript", + ".c": "c", + ".h": "c", + ".cpp": "cpp", + ".cc": "cpp", + ".cxx": "cpp", + ".hpp": "cpp", + ".hh": "cpp", + ".hxx": "cpp", +} + + +def _markdown_fence_for_path(file_path: str) -> str: + """Return a markdown code fence language for ``file_path``.""" + return _FENCE_BY_SUFFIX.get(_Path(file_path).suffix.lower(), "text") # ============================================================================ @@ -69,7 +92,7 @@ def _format_skeleton_context(file_path: str) -> str: "signatures, docstrings, type hints) from the design stage. **Your tests MUST\n" "target ONLY the classes, methods, and signatures defined below.** Do NOT invent\n" "methods or features that are not present in this skeleton.\n\n" - f"```python\n{content}\n```\n" + f"```{_markdown_fence_for_path(file_path)}\n{content}\n```\n" ) @@ -89,7 +112,7 @@ def _format_current_source_context(file_path: str) -> str: "previous step. **Your tests MUST match the real API** (class names, method names,\n" "signatures, return types) as shown below. 
Fix any tests that expect methods or\n" "behaviors not present in this implementation.\n\n" - f"```python\n{content}\n```\n" + f"```{_markdown_fence_for_path(file_path)}\n{content}\n```\n" ) @@ -142,20 +165,33 @@ def _format_dependency_context(ctx: Optional[Dict[str, Any]]) -> str: subs = bc.get("subclasses", {}) if not code: continue - # Extract class name and method names from code - try: - tree = _ast_mod.parse(code) - for node in _ast_mod.walk(tree): - if isinstance(node, _ast_mod.ClassDef): - methods = [n.name for n in node.body - if isinstance(n, (_ast_mod.FunctionDef, _ast_mod.AsyncFunctionDef))] - parts.append(f"- `{node.name}` in `{fp}` — methods: {', '.join(methods)}") - if subs: - for parent, children in subs.items(): - if parent == node.name: - parts.append(f" Subclasses: {', '.join(children)}") - break - except SyntaxError: + # Extract class and method names through the target-language + # backend resolved from the file's extension (defaults to + # Python). Syntax errors yield an empty unit list, so malformed + # base class snippets simply contribute no class summary here. 
+ from decoder_lang import get_backend as _get_backend + from lang_parser import detect_language as _detect_language + backend = _get_backend(_detect_language(fp) or "python") + class_like = {"class", "struct", "interface", "type", "enum"} + units = backend.list_code_units(code, fp) + classes = [ + u for u in units + if u.unit_type in class_like and u.parent is None + ] + if classes: + first_class = classes[0] + methods = [ + u.name for u in units + if u.unit_type == "method" and u.parent == first_class.name + ] + parts.append( + f"- `{first_class.name}` in `{fp}` — methods: {', '.join(methods)}" + ) + if subs: + for parent, children in subs.items(): + if parent == first_class.name: + parts.append(f" Subclasses: {', '.join(children)}") + else: parts.append(f"- `{fp}` (parse error — read file directly)") parts.append("") @@ -330,8 +366,8 @@ def init_test_gen_prompt( "- Keep tests deterministic, readable, and maintainable.\n" "- If the expected behavior is unclear, encode the most reasonable interpretation\n" " and add comments explaining your assumptions.\n" - "- **Only import packages available in the environment.** Use Python standard library\n" - " and internal project modules (`src.*`) freely. For third-party packages, only import\n" + "- **Only import packages available in the environment.** Use the target language's standard library\n" + " and internal project modules freely. For third-party packages, only import\n" " them if they are already used by existing source files. Never add unused imports.\n" "- **CRITICAL: Only test classes, methods, and functions that exist in the skeleton\n" " file below (if provided). 
Do NOT invent or assume additional methods, features,\n" @@ -467,8 +503,8 @@ def init_code_gen_prompt( " prefer the same modules and import style (to stay consistent with the codebase).\n" "- If you introduce new symbols in this file, also add or update the import statements so that the module can be\n" " imported and executed without NameError or ImportError.\n" - "- **Only import packages available in the environment.** Use Python standard library\n" - " and internal project modules (`src.*`) freely. For third-party packages, only import\n" + "- **Only import packages available in the environment.** Use the target language's standard library\n" + " and internal project modules freely. For third-party packages, only import\n" " them if they are already used by existing source files. Before adding any import,\n" " verify you actually USE the imported name in your code — never add unused imports.\n" "\n**Plan first — output a brief summary** (3–5 sentences) before writing any code:\n" @@ -493,7 +529,7 @@ def init_code_gen_prompt( "- Fix only what is needed to make integration tests pass.\n" "- Read the actual source files to understand current implementation before changing.\n" "- Do NOT refactor working code. Only fix broken connections.\n" - "- Do NOT create main.py \u2014 it will be created in a later task.\n" + "- Do NOT create the project entry point \u2014 it will be created in a later task.\n" "- Do NOT edit test files at this stage.\n" ) elif task_type == "final_test_docs": @@ -513,7 +549,7 @@ def init_code_gen_prompt( "- Fix only what is needed to make end-to-end tests pass.\n" "- Read the actual source files to understand current implementation before changing.\n" "- Do NOT refactor working code. 
Only fix broken connections.\n" - "- Do NOT create main.py \u2014 it will be created in the next task.\n" + "- Do NOT create the project entry point \u2014 it will be created in the next task.\n" "- Do NOT edit test files at this stage.\n" ) else: @@ -730,7 +766,7 @@ def init_project_file_gen_prompt( """Generate prompt for project file generation. This is used after all core implementation is complete. - Project files include: requirements.txt, README.md, main.py, etc. + Project files include dependency manifests, README.md, entry points, etc. Args: task: Task description with detailed instructions @@ -782,11 +818,11 @@ def build_project_file_prompt_from_batch( def is_project_file_batch(batch: "PlannedTask") -> bool: - """Check if a batch is for project file generation (requirements, docs, main entry).""" + """Check if a batch is for project file generation.""" return batch.task_type in [ - "project_requirements", # requirements.txt (needs import test) + "project_requirements", # language dependency metadata "project_docs", # README.md (no tests) - "main_entry", # main.py (needs run test) + "main_entry", # language entry point (needs run test) ] @@ -798,7 +834,7 @@ def is_project_docs_batch(batch: "PlannedTask") -> bool: def needs_project_file_test(batch: "PlannedTask") -> bool: """Check if a project file batch needs testing.""" return batch.task_type in [ - "project_requirements", # import validation + "project_requirements", # dependency/import validation "main_entry", # run test ] @@ -883,9 +919,9 @@ def env_fix_prompt( f"{test_result}\n\n" "Guidelines:\n" "- Fix by REMOVING the unused import if the imported name is not actually used in the code,\n" - " OR by replacing the third-party functionality with Python standard library equivalents.\n" + " OR by replacing the third-party functionality with target-language standard library equivalents.\n" "- Search the source file for actual usage of the imported name before deciding.\n" - "- Do NOT attempt to install 
packages or modify requirements.txt.\n" + "- Do NOT attempt to install packages or modify dependency manifests.\n" "- Do NOT modify test files.\n" "- Prefer minimal, targeted changes.\n" "- Logical test failures may remain — that is acceptable.\n" diff --git a/CoderMind/scripts/code_gen/rpg_updater.py b/CoderMind/scripts/code_gen/rpg_updater.py index cad8475..3d4c1c9 100644 --- a/CoderMind/scripts/code_gen/rpg_updater.py +++ b/CoderMind/scripts/code_gen/rpg_updater.py @@ -116,6 +116,12 @@ def analyze_file(self, file_path: Path, code: str) -> None: file_path: Path to the file code: Source code content """ + # Python-AST based: dependencies for other languages come from the + # tree-sitter dependency graph. Skip non-Python sources quietly rather + # than emitting a misleading "SyntaxError parsing" warning on every + # generated .rs/.js/.go/... file. + if Path(file_path).suffix.lower() != ".py": + return try: tree = ast.parse(code) except SyntaxError as e: diff --git a/CoderMind/scripts/code_gen/stage_io.py b/CoderMind/scripts/code_gen/stage_io.py index 7dd25d3..2065223 100644 --- a/CoderMind/scripts/code_gen/stage_io.py +++ b/CoderMind/scripts/code_gen/stage_io.py @@ -35,12 +35,18 @@ def save_stage_result(name: str, data: Dict[str, Any]) -> None: Each pipeline stage (final_test, smoke_test, global_review) saves its output independently. Global review loads all of them as context. + + Uses :func:`common.rpg_io.atomic_write_rpg` so a killed codegen run + can't leave a half-truncated sidecar that ``global_review`` would + then try (and fail) to load. ``default=str`` is forwarded through + ``**dump_kwargs`` to preserve the original fall-back serialiser for + non-JSON-native objects (e.g. ``Path``, datetimes). 
""" LOGS_DIR.mkdir(parents=True, exist_ok=True) dest = stage_path(name) try: - with open(dest, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2, default=str) + from common.rpg_io import atomic_write_rpg + atomic_write_rpg(dest, data, indent=2, default=str) logger.info("Saved stage result: %s", dest) except Exception as exc: logger.debug("Failed to save stage result %s: %s", name, exc) diff --git a/CoderMind/scripts/code_gen/static_checks.py b/CoderMind/scripts/code_gen/static_checks.py index 3badf39..ddb54e9 100644 --- a/CoderMind/scripts/code_gen/static_checks.py +++ b/CoderMind/scripts/code_gen/static_checks.py @@ -6,13 +6,40 @@ """ import ast +import json import logging from pathlib import Path -from typing import List +from typing import Any, List + +from common.paths import FEATURE_SPEC_FILE, REPO_RPG_FILE +from decoder_lang import LanguageBackend, get_backend, resolve_decoder_language logger = logging.getLogger(__name__) +def _load_json_if_exists(path: Path) -> Any: + """Load JSON from ``path`` or return None when unavailable.""" + if not path.exists(): + return None + try: + with open(path, "r", encoding="utf-8") as file: + return json.load(file) + except (OSError, json.JSONDecodeError): + return None + + +def _resolve_static_backend(files: List[str]) -> LanguageBackend: + """Resolve the backend used for static codegen completeness checks.""" + feature_spec = _load_json_if_exists(FEATURE_SPEC_FILE) + rpg_obj = _load_json_if_exists(REPO_RPG_FILE) + language = resolve_decoder_language( + feature_spec=feature_spec, + rpg_obj=rpg_obj, + valid_files=files, + ) + return get_backend(language) + + def static_completeness_check(files: List[str], repo_path: Path) -> List[str]: """Project-type-agnostic static completeness check. @@ -30,6 +57,7 @@ def static_completeness_check(files: List[str], repo_path: Path) -> List[str]: List of human-readable issue strings (empty = all clean). 
""" issues: List[str] = [] + backend = _resolve_static_backend(files) for filepath in files: full_path = repo_path / filepath @@ -37,13 +65,26 @@ def static_completeness_check(files: List[str], repo_path: Path) -> List[str]: issues.append(f"MISSING: {filepath} does not exist") continue - if full_path.suffix != ".py": + if not backend.is_source_file(filepath): continue try: content = full_path.read_text(encoding="utf-8") + except UnicodeDecodeError as exc: + issues.append(f"PARSE_ERROR: {filepath} — {exc}") + continue + + if backend.name != "python": + ok, error = backend.syntax_check(content, filepath) + if not ok: + issues.append(f"PARSE_ERROR: {filepath} — {error}") + if backend.has_placeholder(content, filepath): + issues.append(f"PLACEHOLDER: {filepath} contains placeholder code") + continue + + try: tree = ast.parse(content, filename=filepath) - except (SyntaxError, UnicodeDecodeError) as exc: + except SyntaxError as exc: issues.append(f"PARSE_ERROR: {filepath} — {exc}") continue @@ -166,5 +207,3 @@ def _is_abstract_method(func_node) -> bool: ) return issues - - return issues diff --git a/CoderMind/scripts/code_gen/subtree_review.py b/CoderMind/scripts/code_gen/subtree_review.py index c9ddc1c..45b2663 100644 --- a/CoderMind/scripts/code_gen/subtree_review.py +++ b/CoderMind/scripts/code_gen/subtree_review.py @@ -91,9 +91,11 @@ def _needs_llm_review(subtree_files: List[str], repo_path: Path) -> bool: Returns: True if LLM review is recommended, False if safe to skip. 
""" + from lang_parser import is_supported_source + for filepath in subtree_files: full_path = repo_path / filepath - if not full_path.exists() or full_path.suffix != '.py': + if not full_path.exists() or not is_supported_source(filepath): continue try: content = full_path.read_text(encoding='utf-8', errors='replace') @@ -227,7 +229,7 @@ def is_subtree_just_completed( {skeleton_only_files} ## Test Command -{pytest_cmd} +{test_cmd} ## Output Last line MUST be one of: @@ -246,7 +248,7 @@ def _build_review_prompt( tasks_path: Path, repo_path: Path, project_background: str = "", - pytest_cmd: str = "", + test_cmd: str = "", ) -> str: """Construct the review prompt for an LLM sub-agent.""" all_tasks = load_tasks_from_tasks_json(tasks_path) @@ -283,7 +285,7 @@ def _build_review_prompt( static_check_results=static_check_results or "All static checks passed.", completed_modules_from_other_subtrees=other_list, skeleton_only_files=skel_list, - pytest_cmd=pytest_cmd or "python3 -m pytest tests/ -x --tb=short -q --timeout=30", + test_cmd=test_cmd or "the project's native test command", ) # Append cross-subtree connection check if there are completed dependencies @@ -428,14 +430,12 @@ def run_subtree_review( ) return result - # 3. Build pytest command - from code_gen.test_runner import get_dev_python + # 3. Build the target language's native test command for the review prompt + from code_gen.test_runner import get_dev_python, resolve_test_backend + from code_gen.batch_prompts import _build_backend_test_cmd venv_python = get_dev_python(repo_path) or "python3" - pytest_cmd = ( - f"{venv_python} -m pytest tests/ -x --tb=short -q " - f"--timeout=30 --timeout-method=signal " - f"-W ignore::DeprecationWarning" - ) + backend = resolve_test_backend(valid_files=subtree_files) + test_cmd = _build_backend_test_cmd(backend, repo_path, [], venv_python) # 4. 
Build review prompt prompt = _build_review_prompt( @@ -447,14 +447,14 @@ def run_subtree_review( tasks_path=tasks_path, repo_path=repo_path, project_background=project_background, - pytest_cmd=pytest_cmd, + test_cmd=test_cmd, ) # 5. Setup review branch - from common.git_utils import GitRunner + from common.git_utils import GitRunner, sanitize_branch_component git = GitRunner(str(repo_path)) - safe_name = subtree_name.lower().replace(" ", "_").replace("/", "_")[:40] + safe_name = sanitize_branch_component(subtree_name.lower(), max_len=40, fallback="review") branch_name = f"review/{safe_name}" # Ensure on main first @@ -496,15 +496,17 @@ def run_subtree_review( # 8. Post-verify if review made changes if result.status in ("FIXED", "ALL_COMPLETE"): # Run pytest to verify no regressions - from code_gen.test_runner import run_pytest, ensure_deps_installed - try: - ensure_deps_installed(repo_path) - except Exception: - pass - verify_result = run_pytest( + from code_gen.test_runner import run_project_tests, ensure_deps_installed + if backend.name == "python": + try: + ensure_deps_installed(repo_path) + except Exception: + pass + verify_result = run_project_tests( repo_path, timeout=180, extra_args=["--timeout=30", "--timeout-method=signal"], + backend=backend, ) verify_passed = verify_result.success if verify_passed: diff --git a/CoderMind/scripts/code_gen/task_loader.py b/CoderMind/scripts/code_gen/task_loader.py index ffa63e6..33a20bb 100644 --- a/CoderMind/scripts/code_gen/task_loader.py +++ b/CoderMind/scripts/code_gen/task_loader.py @@ -1,15 +1,14 @@ #!/usr/bin/env python3 """Task selection helpers for the codegen batch orchestrator. -This module hosts the two task-picker helpers that were originally -defined in the now-deleted top-level ``prepare_batch.py``: +This module hosts the two task-picker helpers used by +``scripts.run_batch``: * :func:`get_next_pending_task_id` — pick the next single task to run, with git-based auto-recovery and integration-test deferral. 
* :func:`get_next_merged_tasks` — pick a same-file group of pending implementation tasks for "file-merge" mode batches. -Both are consumed by ``scripts.run_batch``'s Module 5 orchestrator. They share three private helpers — ``_git_grep_pattern``, ``_git_has_gen_code_commit``, ``_has_failed_impl_dependencies`` — kept local to this module since they have no callers elsewhere. diff --git a/CoderMind/scripts/code_gen/test_runner.py b/CoderMind/scripts/code_gen/test_runner.py index 546863d..b4490a5 100644 --- a/CoderMind/scripts/code_gen/test_runner.py +++ b/CoderMind/scripts/code_gen/test_runner.py @@ -13,7 +13,6 @@ import signal import subprocess import sys -import ast import shutil import importlib.util import logging @@ -25,6 +24,15 @@ from common.llm_client import LLMClient import json as _json from common.import_normalizer import normalize_files +from common.paths import FEATURE_SPEC_FILE, REPO_RPG_FILE +from decoder_lang import ( + EnvHandle, + LanguageBackend, + ToolchainUnavailable, + get_backend, + resolve_decoder_language, + scan_repo_source_files, +) def _set_pdeathsig() -> None: @@ -407,6 +415,142 @@ def run_pytest( ) +def _load_json_if_exists(path: Path) -> Any: + """Load JSON from ``path`` or return None when unavailable.""" + if not path.exists(): + return None + try: + with open(path, "r", encoding="utf-8") as file: + return _json.load(file) + except (OSError, _json.JSONDecodeError): + return None + + +def resolve_test_backend( + valid_files: Optional[List[str]] = None, + repo_path: Optional[Path] = None, +) -> LanguageBackend: + """Resolve the backend that should run codegen verification tests. + + Language is resolved through :func:`resolve_decoder_language`'s tier + chain (feature_spec meta -> rpg meta -> dominant language of the + supplied files -> python default). When the caller has no scoped + ``valid_files`` (e.g. 
the final-test / global-review / env-setup + stages operate over the whole repo), pass ``repo_path`` so the + language can still be inferred from the actual on-disk sources rather + than silently defaulting to python for a non-python project. + """ + feature_spec = _load_json_if_exists(FEATURE_SPEC_FILE) + rpg_obj = _load_json_if_exists(REPO_RPG_FILE) + if not valid_files and repo_path is not None: + valid_files = scan_repo_source_files(repo_path) or None + language = resolve_decoder_language( + feature_spec=feature_spec, + rpg_obj=rpg_obj, + valid_files=valid_files, + ) + return get_backend(language) + + +def run_project_tests( + repo_root: Path, + test_files: Optional[List[str]] = None, + timeout: int = 300, + extra_args: Optional[List[str]] = None, + env: Optional[Dict[str, str]] = None, + backend: Optional[LanguageBackend] = None, +) -> TestResult: + """Run the target language's native project test command.""" + selected_backend = backend or resolve_test_backend( + valid_files=test_files, repo_path=repo_root + ) + if selected_backend.name == "python": + return run_pytest( + repo_root, + test_files=test_files, + timeout=timeout, + extra_args=extra_args, + env=env, + ) + + try: + env_handle = selected_backend.detect_env(repo_root) or EnvHandle( + project_root=repo_root.resolve(), + ) + # Settle the build state before testing (no-op for most backends; + # C/C++ reconfigure cmake so ctest sees the current test set rather + # than a stale one left from a mid-edit configure). 
+ prepare = getattr(selected_backend, "prepare_test_env", None) + if callable(prepare): + try: + prepare(env_handle) + except Exception as exc: # noqa: BLE001 - best-effort prep + _logger.debug("prepare_test_env failed (non-fatal): %s", exc) + cmd = selected_backend.test_command(env_handle) + except (ToolchainUnavailable, NotImplementedError, OSError) as exc: + return TestResult( + success=False, + return_code=-1, + output=f"{selected_backend.display_name} test command unavailable: {exc}", + test_files=test_files or [], + ) + + run_env = os.environ.copy() + if env: + run_env.update(env) + + try: + proc = subprocess.Popen( + cmd, + cwd=repo_root, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=run_env, + start_new_session=True, + preexec_fn=_set_pdeathsig, + ) + try: + stdout_data, stderr_data = proc.communicate(timeout=timeout) + except BaseException: + try: + os.killpg(os.getpgid(proc.pid), signal.SIGTERM) + except Exception: + proc.kill() + proc.wait() + raise + + output = stdout_data + if stderr_data: + output += "\n\nSTDERR:\n" + stderr_data + parsed = selected_backend.parse_test_output(output, proc.returncode) + return TestResult( + success=parsed.status == "passed", + return_code=proc.returncode, + output=output, + test_files=test_files or [], + passed=parsed.passed_count, + failed=parsed.failed_count, + errors=parsed.error_count, + skipped=parsed.skipped_count, + duration=parsed.duration_sec, + ) + except subprocess.TimeoutExpired: + return TestResult( + success=False, + return_code=-1, + output=f"Test execution timed out after {timeout} seconds", + test_files=test_files or [], + ) + except Exception as exc: + return TestResult( + success=False, + return_code=-1, + output=f"Test execution failed: {exc}", + test_files=test_files or [], + ) + + def parse_pytest_output(output: str) -> Dict[str, Any]: """Parse pytest output to extract statistics. 
@@ -798,7 +942,13 @@ def scan_missing_imports(repo_root: Path) -> List[str]: if child.is_dir() and not child.name.startswith('.'): project_modules.add(child.name) - # Collect all external imports from source files + # Collect all external imports from source files through the + # backend import scanner. ``LPDependency.extra["module"]`` carries + # the dotted module name for both ``import`` and ``from`` statements; + # use the top-level segment for dependency installation. + from decoder_lang import get_backend + backend = get_backend("python") + external_imports: Set[str] = set() scan_dirs = [d for d in [src_dir, tests_dir] if d.is_dir()] @@ -808,19 +958,16 @@ def scan_missing_imports(repo_root: Path) -> List[str]: continue try: source = py_file.read_text(encoding='utf-8') - tree = ast.parse(source) - except (SyntaxError, UnicodeDecodeError): + except (OSError, UnicodeDecodeError): continue - for node in ast.walk(tree): - mod_name = None - if isinstance(node, ast.Import): - for alias in node.names: - mod_name = alias.name.split('.')[0] - elif isinstance(node, ast.ImportFrom): - if node.module and node.level == 0: - mod_name = node.module.split('.')[0] - if mod_name is None: + for dep in backend.list_imports(source, str(py_file)): + extra = dep.extra or {} + module = extra.get("module") or "" + if not module or module.startswith("."): + # Skip relative imports; they refer to project-local + # modules rather than installable third-party packages. 
continue + mod_name = module.split(".")[0] if mod_name in _STDLIB_TOP_LEVEL or mod_name in project_modules: continue external_imports.add(mod_name) diff --git a/CoderMind/scripts/common/git_utils.py b/CoderMind/scripts/common/git_utils.py index 172b6b2..3c72866 100644 --- a/CoderMind/scripts/common/git_utils.py +++ b/CoderMind/scripts/common/git_utils.py @@ -10,12 +10,56 @@ """ import logging +import re import subprocess from pathlib import Path from typing import Optional, Tuple, List, Dict from dataclasses import dataclass +_INVALID_REF_CHARS = re.compile(r"[^A-Za-z0-9._-]+") + + +def sanitize_branch_component( + component: str, + max_len: int = 50, + fallback: str = "x", +) -> str: + """Normalize a dynamic string into a git-safe branch path component. + + The result is safe to embed as ``/`` in a branch name. + It replaces characters git rejects in refs (spaces, ``~^:?*[`` and + backslash, control chars) with ``_``, collapses ``..`` and repeated + separators, strips leading/trailing separators, caps length, and avoids a + trailing ``.`` or ``.lock`` suffix. Always returns a non-empty token so + callers can build a valid ref for any language's task identifiers. + + Args: + component: Raw dynamic text (task id, subtree name, ...). + max_len: Maximum length of the returned component. + fallback: Token returned when sanitization yields an empty string. + + Returns: + A git-ref-safe, non-empty component string. 
+ """ + raw = (component or "").strip() + if not raw: + return fallback + + safe = _INVALID_REF_CHARS.sub("_", raw.replace("\\", "/").replace("/", "_")) + safe = safe.replace("..", "_") + safe = re.sub(r"[._-]{2,}", "_", safe) + safe = safe.strip("._-") + if not safe: + return fallback + + safe = safe[:max_len].rstrip("._-") + if safe.endswith(".lock"): + safe = safe[: -len(".lock")].rstrip("._-") + + return safe or fallback + + @dataclass class GitResult: """Result of a Git command execution.""" @@ -769,7 +813,7 @@ def create_task_branch( git = GitRunner(repo_path) # Create sanitized branch name - safe_id = batch_id.replace("/", "_").replace("\\", "_")[:50] + safe_id = sanitize_branch_component(batch_id, max_len=50, fallback="task") branch_name = f"task/{safe_id}" # Handle uncommitted changes diff --git a/CoderMind/scripts/common/language_meta.py b/CoderMind/scripts/common/language_meta.py new file mode 100644 index 0000000..88faafd --- /dev/null +++ b/CoderMind/scripts/common/language_meta.py @@ -0,0 +1,79 @@ +"""Helpers for language metadata in generated decoder artifacts.""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any + + +PRIMARY_LANGUAGE_FIELD = "primary_language" +TARGET_LANGUAGES_FIELD = "target_languages" + +_LANGUAGE_ALIASES = { + "c++": "cpp", + "cplusplus": "cpp", + "cc": "cpp", + "js": "javascript", + "jsx": "javascript", + "ts": "typescript", + "tsx": "typescript", +} + + +def canonical_language_name(value: str) -> str: + """Return the decoder's canonical language key for a raw name.""" + cleaned = value.strip().lower() + return _LANGUAGE_ALIASES.get(cleaned, cleaned) + + +def normalize_language_metadata( + primary: Any = None, + languages: Any = None, +) -> tuple[str | None, list[str]]: + """Returns a primary language and normalized ordered language list.""" + normalized: list[str] = [] + if isinstance(languages, list): + for language in languages: + if isinstance(language, str): + cleaned = 
canonical_language_name(language) + if cleaned and cleaned not in normalized: + normalized.append(cleaned) + + clean_primary = None + if isinstance(primary, str): + candidate = canonical_language_name(primary) + if candidate: + clean_primary = candidate + + if clean_primary: + if clean_primary in normalized: + normalized.remove(clean_primary) + normalized.insert(0, clean_primary) + elif normalized: + clean_primary = normalized[0] + + return clean_primary, normalized + + +def extract_language_metadata(data: Any) -> tuple[str | None, list[str]]: + """Reads canonical language metadata from ``meta``.""" + meta = _get_value(data, "meta") + primary = _get_value(meta, PRIMARY_LANGUAGE_FIELD) + languages = _get_value(meta, TARGET_LANGUAGES_FIELD) + return normalize_language_metadata(primary, languages) + + +def metadata_with_languages(data: Any, base_meta: Any = None) -> dict[str, Any]: + """Returns a metadata object with canonical language fields.""" + meta = _get_value(data, "meta") if base_meta is None else base_meta + result = dict(meta) if isinstance(meta, Mapping) else {} + primary, languages = extract_language_metadata(data) + result[PRIMARY_LANGUAGE_FIELD] = primary + result[TARGET_LANGUAGES_FIELD] = languages + return result + + +def _get_value(data: Any, key: str) -> Any: + if isinstance(data, Mapping): + return data.get(key) + return getattr(data, key, None) \ No newline at end of file diff --git a/CoderMind/scripts/common/paths.py b/CoderMind/scripts/common/paths.py index f45c38b..bebd0d0 100644 --- a/CoderMind/scripts/common/paths.py +++ b/CoderMind/scripts/common/paths.py @@ -17,7 +17,7 @@ └── workspaces// ├── .meta.toml ← channel, timestamps, version ├── .git/ ← Plan-03 inner snapshot repo - ├── data/ ← rpg.json, dep_graph.json, … + ├── data/ ← rpg.json, feature specs, … │ └── trajectory/ └── logs/ ← *.log, mcp_calls.jsonl, … @@ -263,6 +263,15 @@ def cmd_for(script_relpath: str) -> str: RPG_FILE = DATA_DIR / "rpg.json" REPO_RPG_FILE = RPG_FILE # Unified: 
both encoder and decoder use rpg.json +# ``DEP_GRAPH_FILE``: legacy standalone dep_graph location. +# As of the embed migration the dep_graph rides inside ``rpg.json`` +# (``RPG.to_dict(include_dep_graph=True)``). New code no longer writes +# this file; the constant stays so legacy workspaces with an existing +# ``dep_graph.json`` continue to load via ``RPGService.load``'s compat +# path, and so a few CLI flags (``--dep-graph`` in update_graphs.py / +# rpg_visualize.py / rpg_edit/apply.py) still resolve a sensible +# default. Safe to remove once those CLIs are pruned in a future +# breaking-change release. DEP_GRAPH_FILE = DATA_DIR / "dep_graph.json" REPO_INFO_FILE = DATA_DIR / "repo_info.json" diff --git a/CoderMind/scripts/common/rpg_io.py b/CoderMind/scripts/common/rpg_io.py index 990ffe4..4221704 100644 --- a/CoderMind/scripts/common/rpg_io.py +++ b/CoderMind/scripts/common/rpg_io.py @@ -66,6 +66,7 @@ def atomic_write_rpg( *, indent: int = 2, ensure_ascii: bool = False, + **dump_kwargs: Any, ) -> None: """Serialise ``data`` to ``path`` atomically as JSON. @@ -75,14 +76,22 @@ def atomic_write_rpg( The signature matches ``json.dump`` for indent / ensure_ascii so callers swapping ``open(path, "w") + json.dump`` for this helper - don't have to rethink their JSON formatting choices. + don't have to rethink their JSON formatting choices. Additional + ``**dump_kwargs`` are forwarded to ``json.dump`` (e.g. ``default=`` + for non-serialisable encoder rounds), letting every legacy caller + migrate without losing custom serialiser hooks. 
""" path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) tmp = path.with_suffix(path.suffix + ".tmp") try: with open(tmp, "w", encoding="utf-8") as f: - json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii) + json.dump( + data, f, + indent=indent, + ensure_ascii=ensure_ascii, + **dump_kwargs, + ) f.write("\n") # fsync gives us strong durability guarantees: an os.replace # immediately after a crash could otherwise expose the @@ -150,7 +159,7 @@ def safe_load_rpg(path: Path | str) -> Any: # Filenames inside the inner-git repo that we know how to recover. # Mirrors the layout produced by :mod:`cmind_cli._inner_git`: -# ``data/rpg.json``, ``data/dep_graph.json``, etc. +# ``data/rpg.json``, ``data/feature_spec.json``, etc. def _git_relpath_for(path: Path) -> Optional[str]: """Return the path relative to the home-workspace dir for git lookup. diff --git a/CoderMind/scripts/common/utils.py b/CoderMind/scripts/common/utils.py index 360fff9..1dd1874 100644 --- a/CoderMind/scripts/common/utils.py +++ b/CoderMind/scripts/common/utils.py @@ -27,6 +27,51 @@ logger = logging.getLogger(__name__) +# Directory names repository scanners never descend into: version control, +# editor metadata, virtualenvs, dependency installs, and the build-output +# trees of every language the decoder targets (Rust ``target``, CMake build +# dirs, ``build``/``dist``, JS framework caches). Centralized so dep_graph and +# the RPG encoder share one definition instead of each keeping its own list. +# +# Note: any dot-prefixed directory (``.git``, ``.github``, ``.cmind``, +# ``.venv``, ...) is also skipped via :func:`is_skip_dir`; the explicit +# entries below cover the non-dotted build/dependency dirs plus a few common +# dot-dirs kept for readability. 
+SCAN_SKIP_DIRS = frozenset({ + ".git", ".hg", ".svn", + ".github", ".cmind", + "__pycache__", ".pytest_cache", ".mypy_cache", + ".idea", ".vscode", + ".venv", "venv", "env", + "node_modules", ".next", ".nuxt", + "target", + "build", "dist", + "cmake-build-debug", "cmake-build-release", +}) + + +def is_skip_dir(name: str) -> bool: + """Return True if a directory ``name`` should never be scanned. + + Skips every dot-prefixed directory (``.git``, ``.github``, ``.cmind``, + editor/tooling state, virtualenvs) plus the explicit build/dependency + dirs in :data:`SCAN_SKIP_DIRS`. Centralizes the rule so ``os.walk`` + scanners and the dependency-graph path filter stay consistent. + """ + return name.startswith(".") or name in SCAN_SKIP_DIRS + + +def path_has_skip_dir(path: str) -> bool: + """Return True if any parent directory of ``path`` is a skip dir. + + Used by path-string filters (e.g. the dependency-graph build filter) + that receive a relative path rather than walking a tree. + """ + parts = PurePosixPath(str(path).replace("\\", "/")).parts + # The last part is the file name; only directory parts gate inclusion. + return any(is_skip_dir(part) for part in parts[:-1]) + + # ============================================================================ # Repository Info Functions # ============================================================================ diff --git a/CoderMind/scripts/decoder_lang/__init__.py b/CoderMind/scripts/decoder_lang/__init__.py new file mode 100644 index 0000000..d4e848f --- /dev/null +++ b/CoderMind/scripts/decoder_lang/__init__.py @@ -0,0 +1,90 @@ +"""Decoder language abstraction layer. + +This package introduces a :class:`LanguageBackend` strategy interface +that lets the decoder pipeline (skeleton / func_design / code_gen) +treat the target programming language as a parameter rather than +a hard-coded ``.py`` / ``ast`` / ``pytest`` assumption. 
+ +The registry currently ships :class:`PythonBackend`, :class:`GoBackend`, +:class:`RustBackend`, :class:`TypeScriptBackend`, :class:`CBackend`, and +:class:`CppBackend` implementations. Decoder stages resolve the backend +from explicit feature-spec language, RPG metadata, or source-file +dominant language. + +Public API (see :mod:`decoder_lang.backend` for full signatures): + +* :class:`LanguageBackend` — Protocol every backend implements. +* :class:`PythonBackend` — production backend used by the existing + Python decoder pipeline. +* :func:`get_backend` — registry lookup; falls back to Python with a + single WARNING log when the requested language is unknown. +* :func:`register_backend` — decorator used by backend modules to + self-register on import. +* :class:`PromptHints`, :class:`EnvHandle`, :class:`TestRunResult`, + :class:`TestFailure` — value types passed across the interface. +""" +from __future__ import annotations + +from .backend import ( + LanguageBackend, + ToolchainUnavailable, + default_find_existing_entry, + get_backend, + list_backends, + register_backend, + resolve_decoder_language, + resolve_repo_backend, + resolve_target_language, + scan_repo_source_files, +) +from .c_backend import CBackend +from .cpp_backend import CppBackend +from .go_backend import GoBackend +from .javascript_backend import JavaScriptBackend +from .prompt_directive import language_directive, with_language_directive +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .python_backend import PythonBackend +from .rust_backend import RustBackend +from .test_result import EnvHandle, TestFailure, TestRunResult +from .typescript_backend import TypeScriptBackend + +# Side-effect: register backends on package import so the registry is +# populated even when callers only ``import decoder_lang``. Python is +# the decoder's default; Go provides parser-backed code-structure and +# basic Go toolchain/test-runner behavior. 
+register_backend(PythonBackend) +register_backend(GoBackend) +register_backend(RustBackend) +register_backend(TypeScriptBackend) +register_backend(JavaScriptBackend) +register_backend(CBackend) +register_backend(CppBackend) + +__all__ = [ + "EnvHandle", + "CBackend", + "CppBackend", + "GoBackend", + "JavaScriptBackend", + "LanguageBackend", + "PromptHints", + "ProjectTaskContext", + "ProjectTaskTemplates", + "PythonBackend", + "RustBackend", + "TestFailure", + "TestRunResult", + "ToolchainUnavailable", + "TypeScriptBackend", + "default_find_existing_entry", + "get_backend", + "language_directive", + "list_backends", + "register_backend", + "resolve_decoder_language", + "resolve_repo_backend", + "resolve_target_language", + "scan_repo_source_files", + "with_language_directive", +] diff --git a/CoderMind/scripts/decoder_lang/backend.py b/CoderMind/scripts/decoder_lang/backend.py new file mode 100644 index 0000000..4d2d044 --- /dev/null +++ b/CoderMind/scripts/decoder_lang/backend.py @@ -0,0 +1,605 @@ +"""Strategy interface every per-language decoder backend implements. + +A backend bundles all language-specific behaviour the decoder needs: + +* file / package layout conventions (extension, package marker file, + identifier rules); +* code-structure extraction (signatures, classes, functions, + placeholder detection) — implementations delegate to + :mod:`lang_parser` so encoder and decoder agree on AST semantics; +* build / test environment handling (venv / go mod / cargo); +* test command + native-output parsing; +* prompt fill values. + +The interface is a runtime-checkable :class:`Protocol`; backends may +either subclass it or just match the structural shape. Implementations +are plain classes that satisfy the protocol so static type-checking and +``isinstance`` both work. Backends can implement a useful subset while +raising :class:`NotImplementedError` for operations a decoder stage does +not call for that language. 
+""" +from __future__ import annotations + +import logging +from pathlib import Path +from typing import ( + Any, + Iterable, + Protocol, + runtime_checkable, +) + +from common.language_meta import extract_language_metadata + +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .test_result import EnvHandle, TestRunResult + +# Re-exported for convenience of method signatures. Callers that want +# to consume the AST output can import LPCodeUnit / LPDependency +# directly from ``lang_parser`` — backends produce instances of those +# types so the decoder doesn't depend on ``ast``. +try: # pragma: no cover - lang_parser is a peer package + from lang_parser import LPCodeUnit, LPDependency # type: ignore +except ImportError: # pragma: no cover + LPCodeUnit = Any # type: ignore[assignment,misc] + LPDependency = Any # type: ignore[assignment,misc] + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + + +class ToolchainUnavailable(RuntimeError): + """Raised by :meth:`LanguageBackend.ensure_env` when the host machine + lacks the compiler / runtime needed for the language (e.g. no ``go`` + binary on PATH for :class:`GoBackend`). + + Callers in the decoder must translate this into a + :class:`~decoder_lang.test_result.TestRunResult` with + ``status="skipped"`` so the verification step becomes a non-fatal + WARN rather than a crash. The :class:`PythonBackend` never raises + this; if the Python interpreter that imported the decoder is + running, the toolchain is by definition available. 
+ """ + + +# --------------------------------------------------------------------------- +# Backend interface +# --------------------------------------------------------------------------- + + +@runtime_checkable +class LanguageBackend(Protocol): + """Per-language behaviour bundle consumed by the decoder pipeline. + + Implementations are expected to be stateless (call-safe from + multiple threads), cheap to construct, and idempotent. The + registry caches one instance per language; backends MUST NOT + store per-project state on ``self``. + """ + + # --- Identity -------------------------------------------------------- + + name: str # registry key, e.g. "python" + display_name: str # human-readable, e.g. "Python" + file_extension: str # primary source extension, e.g. ".py" + markdown_fence: str # code-fence tag, e.g. "python" + + # --- 1. File & package layout --------------------------------------- + + def is_source_file(self, path: str) -> bool: + """Return True when ``path`` is a source file the decoder should + treat as compilable for this language. Should exclude test + files (callers use :meth:`is_test_file` for those).""" + + def is_test_file(self, path: str) -> bool: + """Return True for test files (e.g. 
``tests/test_*.py``, + ``*_test.go``).""" + + def package_marker_filename(self) -> str | None: + """File that signals a directory is a package in this language + (``__init__.py`` for Python; ``None`` for Go / Rust / TypeScript + where directories are packages by virtue of containing source + files).""" + + def package_marker_content(self, pkg_path: str) -> str | None: + """Initial body for the marker file, or ``None`` if the marker + does not exist for this language.""" + + def is_valid_module_identifier(self, segment: str) -> bool: + """Check whether ``segment`` is a legal module / package name + component for this language (used by skeleton path validation).""" + + def sanitize_module_identifier(self, segment: str) -> str: + """Convert an arbitrary string into a legal module name + component, e.g. replacing hyphens with underscores for Python. + Idempotent: ``sanitize(sanitize(s)) == sanitize(s)``.""" + + def entry_point_path(self, module: str) -> str: + """Return the canonical program entry-point file path for this + language (``main.py`` for Python, ``cmd/{module}/main.go`` for + Go, ``src/index.js`` for JavaScript, ``src/main.rs`` for Rust, + ``src/main.cpp`` for C/C++). + + ``module`` is the sanitized project / command name (used by Go's + ``cmd//`` convention; ignored by languages with a fixed + entry path). Consumed by the planner (to avoid generating a + second entry when the skeleton already placed one) and by + ``check_code_gen`` / smoke tests (to locate the entry without a + hard-coded ``main.py``).""" + + def entry_run_command( + self, repo_root: Path, entry: str + ) -> list[str] | None: + """Return the ``subprocess`` argv that runs the entry point in a + *clean* checkout (no PYTHONPATH/path bridging injected), suitable + for a smoke "does it start?" probe — e.g. + ``["python", "main.py", "--help"]``, + ``["go", "run", "./cmd/app"]``, ``["node", "src/index.js"]``, + ``["./calc", "--help"]``, ``["cargo", "run", "--", "--help"]``. 
+ + Returns ``None`` when the language has no meaningful run probe or + the entry cannot be located; callers treat ``None`` as "skip" + (non-fatal), never as failure.""" + + def find_existing_entry(self, interfaces: dict[str, Any]) -> str | None: + """Return the entry-point file the skeleton already placed, if any. + + The skeleton (built by an LLM) may put the program entry at a + non-canonical path — e.g. C++ ``src/cli/main.cpp`` rather than the + backend's default ``src/main.cpp``. The planner calls this before + synthesising the ```` task so it can reuse that file + instead of generating a SECOND entry at the canonical path (the + dual-``main`` bug). Returns the existing path (repo-relative POSIX) + or ``None`` when the skeleton declared no entry, in which case the + backend's canonical :meth:`entry_point_path` is used. + + The default behaviour (see :func:`default_find_existing_entry`) + matches on the canonical entry's filename; backends with a + directory convention (Go's ``cmd//main.go``) override this + to encode the stricter shape.""" + + def entry_point_candidates(self) -> list[str]: + """Return accepted entry-file path patterns for verification. + + ``check_code_gen`` checks that the ```` task produced + a real entry file. A single canonical path is too strict when the + skeleton placed the entry elsewhere (or used a glob-shaped + convention), so backends return every accepted shape here. Entries + may contain ``*`` globs (Go's ``cmd/*/main.go``). The default is + just ``[entry_point_path("")]``.""" + + def prepare_test_env(self, env: EnvHandle) -> None: + """Hook run before the test command to settle the build state. + + No-op for interpreted languages. Compiled languages whose test + set is materialised by a build configurator (C/C++ ``cmake``) + override this to (re)configure the build directory, so the test + runner never observes a stale/partial test set generated against + an earlier source revision. 
Must be idempotent and tolerate a + missing toolchain (degrade to no-op rather than raise).""" + + # --- 2. Code structure (delegates to lang_parser) ------------------- + + def has_placeholder(self, code: str, path: str = "") -> bool: + """Return True when ``code`` contains a return-placeholder + pattern (TODO / PLACEHOLDER / NOT IMPLEMENTED string return).""" + + def syntax_check(self, code: str, path: str = "") -> tuple[bool, str | None]: + """Return ``(ok, error_message)``. ``ok=True`` means the source + parses cleanly; ``error_message`` is human-readable diagnostic + text when ``ok=False`` (caller surfaces it to the LLM).""" + + def list_code_units( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Return every class / function / method declaration in + ``code`` as :class:`lang_parser.LPCodeUnit` instances (flat + list including nested declarations). + + Backends implement this with whatever parsing infrastructure + they have (stdlib ``ast`` for Python, tree-sitter via + :mod:`lang_parser` for Go / Rust / etc.). On syntax errors, + backends return an empty list rather than raising — callers + already tolerate empty results and rendering the error to the + LLM is the orchestrator's responsibility, not the backend's. + """ + + def format_signature(self, unit: Any) -> str: + """Format a function / method :class:`LPCodeUnit` into a + concise one-line signature, e.g. + ``parse(data: bytes) -> Result``. Backends that can't extract + a precise signature fall back to ``unit.name``.""" + + def list_imports( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Return :class:`lang_parser.LPDependency` records for every + import statement in ``code``. On syntax error returns ``[]``.""" + + def list_inheritance( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Return :class:`lang_parser.LPDependency` records (one per + inheritance edge) for class/type declarations in ``code``. 
+ + Each record carries ``src`` = the deriving type name and + ``symbol`` (and ``dst``) = the base/parent name, so consumers + can resolve inheritance uniformly across languages. Backends + whose parser does not model inheritance (or on syntax error) + return ``[]``; that inheritance is then supplied by the + LLM-declared dependencies instead. + """ + + def unit_kind(self, unit_name: str) -> str: + """Classify an interface unit name as ``"callable"`` / + ``"type"`` / ``"unknown"``. + + ``unit_name`` carries a leading kind token written by the + interface designer (``"function parse"``, ``"method serve"``, + ``"struct Store"``, ``"class Parser"``). Callers use this to + decide whether the orphan heuristic ("no incoming invocation + edge => dead code") applies: it is meaningful for ``"callable"`` + units but produces false positives for ``"type"`` units (a data + structure legitimately has no incoming *invocation* edge). + + Implementations delegate to + :func:`decoder_lang.unit_kind.classify_unit_kind` with their own + prefix sets, so a language with an unusual callable keyword can + override classification without touching call sites.""" + + def is_callable_unit(self, unit_name: str) -> bool: + """Return True when :meth:`unit_kind` is ``"callable"``. + + Convenience wrapper for orphan-detection call sites; type-like + and unrecognised units return False so they are excluded from + the "dead code" heuristic (the false-positive-reducing side).""" + + # --- 3. Build / test environment ------------------------------------ + + def detect_env(self, repo_root: Path) -> EnvHandle | None: + """Return an :class:`EnvHandle` describing an existing + environment (e.g. a ``.venv_dev`` directory for Python, a + ``go.mod`` + ``go`` binary on PATH for Go). Never raises. + Returns ``None`` when no usable env is present.""" + + def ensure_env(self, repo_root: Path) -> EnvHandle: + """Detect-or-create the environment. Idempotent. 
Raises + :class:`ToolchainUnavailable` when the language toolchain is + missing from the host (compiler / runtime).""" + + def test_command( + self, + env: EnvHandle, + selectors: list[str] | None = None, + ) -> list[str]: + """Return the ``subprocess`` argv to run the project's tests. + ``selectors`` is a backend-specific filter (pytest ``-k`` + expression, ``go test -run`` regex, etc.); backends ignore it + if not supported.""" + + def install_deps_command( + self, + env: EnvHandle, + deps: list[str], + ) -> list[str] | None: + """Return the argv to install third-party dependencies, or + ``None`` when the language manages deps implicitly (Go modules + auto-fetch on build).""" + + # --- 4. Test-output parsing ----------------------------------------- + + def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult: + """Parse the native test-tool output (pytest text, ``go test`` + JSON, etc.) into the parser-agnostic + :class:`TestRunResult`. Backends that fail to extract anything + useful must still return a valid result (``raw_output=raw``) + so callers can fall back to LLM-driven analysis.""" + + # --- 5. Prompt hints ------------------------------------------------ + + def prompt_hints(self) -> PromptHints: + """Return the per-language strings the decoder injects into + LLM prompts (see :class:`PromptHints` for the field list).""" + + def project_task_templates( + self, + context: ProjectTaskContext, + ) -> ProjectTaskTemplates | None: + """Return project-level task prompts, or None for planner fallback.""" + + +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + + +_REGISTRY: dict[str, LanguageBackend] = {} +_DEFAULT_BACKEND_NAME = "python" + + +def register_backend(cls: type[LanguageBackend]) -> type[LanguageBackend]: + """Register a backend class. Instantiates it once and stores the + singleton under its ``name`` attribute. 
def resolve_target_language(
    rpg_obj: Any,
    valid_files: Iterable[str] | None = None,
) -> str:
    """Determine the project's target language using a three-tier chain.

    1. ``rpg_obj["root"]["meta"]["language"]`` when it is a non-empty
       string (written by the encoder).
    2. ``lang_parser.dominant_language(valid_files)`` when tier 1 yields
       nothing (older RPG artefacts) and ``valid_files`` is supplied.
    3. ``"python"`` as a last-resort default, with a WARNING log so
       silent fallbacks are visible during debugging.

    The helper lives in this package — rather than in
    :mod:`lang_parser` — because the "default to python" rule is
    decoder-specific. The encoder never assumes a default language.

    Args:
        rpg_obj: Loaded RPG artefact. Anything that is not a ``dict`` with
            the expected ``root`` / ``meta`` nesting simply skips tier 1.
        valid_files: Source paths for tier-2 inference, or ``None`` to
            skip that tier.

    Returns:
        The resolved language name.

    Tier 2 is best-effort. A missing ``lang_parser`` package, or an
    exception raised by ``dominant_language`` itself, is treated as "no
    answer" so the chain still reaches the default instead of propagating
    the error to the caller.
    """
    # Tier 1: read from RPG root meta. Each step degrades to None as soon
    # as the expected dict nesting is missing.
    root = rpg_obj.get("root") if isinstance(rpg_obj, dict) else None
    meta = root.get("meta") if isinstance(root, dict) else None
    lang = meta.get("language") if isinstance(meta, dict) else None
    if isinstance(lang, str) and lang:
        return lang

    # Tier 2: dominant_language over the provided file list.
    if valid_files is not None:
        try:
            # Local import to avoid forcing lang_parser as a hard dep
            # of decoder_lang at import time.
            from lang_parser import dominant_language  # type: ignore
        except ImportError:
            dominant_language = None  # type: ignore[assignment]
        if dominant_language is not None:
            try:
                inferred = dominant_language(valid_files)
            except Exception:  # noqa: BLE001 - inference is best-effort
                # Keep the failure visible, then fall through to tier 3.
                logger.warning(
                    "resolve_target_language: dominant_language failed; "
                    "ignoring tier-2 inference",
                    exc_info=True,
                )
                inferred = None
            if inferred:
                return inferred

    # Tier 3: default to python with a single warning so the silent
    # fallback path is visible in logs.
    logger.warning(
        "resolve_target_language: no language on RPG root and no "
        "dominant_language fallback; defaulting to 'python'",
    )
    return _DEFAULT_BACKEND_NAME
``lang_parser.dominant_language(valid_files)``. + 3. ``"python"`` default with WARNING. + + Returns the resolved language name; callers feed it directly to + :func:`get_backend`. Never raises. + """ + # Tier 0: explicit override on feature_spec + if feature_spec is not None: + primary, languages = extract_language_metadata(feature_spec) + if primary: + return primary + if languages: + return languages[0] + # Tier 1-3 share the same logic as resolve_target_language. + return resolve_target_language(rpg_obj, valid_files=valid_files) + + +def cmake_reconfigure(env: Any) -> None: + """Reconfigure a CMake build dir so a later ``ctest`` sees a fresh test set. + + The C/C++ test command runs ``ctest`` against a ``build/`` directory + whose registered test set is materialised by ``cmake``. When sources + or ``CMakeLists.txt`` changed since the last configure, ``ctest`` can + observe a STALE / partial test set (the post-verify "ran 1 test" + false-failure that failed an otherwise-green C++ stage). Running + ``cmake -S -B build`` here regenerates the test registration + against the current tree before tests run. + + No-op (silently) when there is no ``CMakeLists.txt`` or no ``cmake`` + on PATH — the project then uses ``make`` / direct compile, which has + no separate configure step. Never raises: a failed reconfigure must + not crash the verification stage (the test command surfaces a real + build error itself). 
def default_find_existing_entry(
    backend: "LanguageBackend",
    interfaces: dict[str, Any],
) -> str | None:
    """Filename-match the backend's canonical entry against the skeleton.

    Shared default for :meth:`LanguageBackend.find_existing_entry`. Scans
    every designed file path in ``interfaces`` and returns the first whose
    basename equals the canonical entry's basename (e.g. ``main.cpp``), so
    a skeleton entry placed off the canonical path (``src/cli/main.cpp``
    vs ``src/main.cpp``) is reused instead of duplicated.

    Args:
        backend: Object exposing ``entry_point_path(module)``; only the
            basename of ``entry_point_path("")`` is used.
        interfaces: Mapping with a ``"subtrees"`` dict. Each subtree may
            carry its file paths as the keys of an ``"interfaces"`` dict
            or, failing that, a ``"files"`` dict.

    Returns:
        The first matching path with backslashes normalised to ``/``, or
        ``None`` when nothing matches or the input is malformed.

    Malformed input is tolerated rather than raised on: a null or
    non-mapping ``"subtrees"`` value, a non-dict subtree, or a non-string
    canonical path all yield ``None``. Pure / side-effect-free so backends
    can call it from their override.
    """
    try:
        canonical = backend.entry_point_path("")
    except Exception:  # noqa: BLE001 - defensive; treat as "no canonical"
        return None
    if not isinstance(canonical, str) or not isinstance(interfaces, dict):
        return None

    target_name = canonical.replace("\\", "/").rsplit("/", 1)[-1]
    if not target_name:
        return None

    # The key may be present but null / not a mapping; only a dict has
    # subtrees worth scanning.
    subtrees = interfaces.get("subtrees")
    if not isinstance(subtrees, dict):
        return None

    for subtree in subtrees.values():
        if not isinstance(subtree, dict):
            continue
        # Prefer "interfaces"; fall back to "files" when it is missing or
        # unusable.
        container = subtree.get("interfaces")
        if not isinstance(container, dict):
            container = subtree.get("files")
        if not isinstance(container, dict):
            continue
        for file_path in container:
            norm = str(file_path).replace("\\", "/")
            if norm.rsplit("/", 1)[-1] == target_name:
                return norm
    return None
def resolve_repo_backend(
    repo_root: "Path | str",
    *,
    feature_spec: Any = None,
    rpg_obj: Any = None,
) -> "LanguageBackend":
    """Resolve the language backend for an on-disk repo (canonical path).

    Runs :func:`scan_repo_source_files` over ``repo_root`` first and hands
    the result to :func:`resolve_decoder_language` together with
    ``feature_spec`` and ``rpg_obj``. When neither artefact names a
    language, the real sources on disk can therefore decide it before the
    python default is considered. The resolved name is looked up with
    :func:`get_backend`.

    Post-codegen verification stages (final test, smoke test, post-verify)
    are meant to route through this helper so a non-python project is not
    verified with the python backend.
    """
    discovered = scan_repo_source_files(repo_root)
    # An empty scan is passed on as None, which switches off the
    # file-based inference tier instead of feeding it an empty list.
    candidates = discovered if discovered else None
    return get_backend(
        resolve_decoder_language(
            feature_spec=feature_spec,
            rpg_obj=rpg_obj,
            valid_files=candidates,
        )
    )
"while", +}) + + +class CBackend: + """:class:`LanguageBackend` for C source.""" + + name = "c" + display_name = "C" + file_extension = ".c" + markdown_fence = "c" + + def is_source_file(self, path: str) -> bool: + return path.endswith(_C_SOURCE_EXTENSIONS) + + def is_test_file(self, path: str) -> bool: + normalised = path.replace("\\", "/") + basename = normalised.rsplit("/", 1)[-1] + return ( + f"/{normalised}".startswith("/tests/") + or "/tests/" in f"/{normalised}" + or basename.endswith("_test.c") + or basename.startswith("test_") and basename.endswith(".c") + ) + + def package_marker_filename(self) -> str | None: + return None + + def package_marker_content(self, pkg_path: str) -> str | None: + return None + + def is_valid_module_identifier(self, segment: str) -> bool: + if not segment or segment in _C_KEYWORDS: + return False + return bool(_C_IDENT_RE.match(segment)) + + def sanitize_module_identifier(self, segment: str) -> str: + if not segment: + return "_" + cleaned = _C_IDENT_INVALID.sub("_", segment) + if cleaned[:1].isdigit(): + cleaned = f"_{cleaned}" + if cleaned in _C_KEYWORDS: + cleaned = f"{cleaned}_" + return cleaned + + def has_placeholder(self, code: str, path: str = "") -> bool: + ok, _ = self.syntax_check(code, path) + return ok and bool(_PLACEHOLDER_RE.search(code)) + + def syntax_check(self, code: str, path: str = "") -> tuple[bool, str | None]: + return self._parser().validate_syntax(self._parse_path(path), code) + + def list_code_units(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [ + unit for unit in result.units + if unit.unit_type in {"struct", "enum", "union", "function"} + ] + + def format_signature(self, unit: Any) -> str: + if unit is None: + return "" + code = (getattr(unit, "code", "") or "").strip() + if not code: + return getattr(unit, "name", "") or "" + first = code.split("{", 1)[0].split(";", 1)[0].strip() + return " 
".join(first.split()) or (getattr(unit, "name", "") or "") + + def list_imports(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "imports"] + + def list_inheritance(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "inherits"] + + def unit_kind(self, unit_name: str) -> str: + return classify_unit_kind(unit_name) + + def is_callable_unit(self, unit_name: str) -> bool: + return classify_unit_kind(unit_name) == "callable" + + def entry_point_path(self, module: str) -> str: + return "src/main.c" + + def find_existing_entry(self, interfaces: dict) -> str | None: + from .backend import default_find_existing_entry + + return default_find_existing_entry(self, interfaces) + + def entry_point_candidates(self) -> list[str]: + return [self.entry_point_path("")] + + def prepare_test_env(self, env: EnvHandle) -> None: + from .backend import cmake_reconfigure + + cmake_reconfigure(env) + + def entry_run_command(self, repo_root: Path, entry: str) -> list[str] | None: + # Compiled CLI: the run probe needs a built binary whose name is + # project-specific, so locating it is left to the smoke layer. 
+ return None + + def detect_env(self, repo_root: Path) -> EnvHandle | None: + cc = self._find_compiler() + make = shutil.which("make") + if not cc and not make: + return None + root = repo_root.resolve() + return EnvHandle( + project_root=root, + runtime_executable=make or cc, + extra={ + "cc": cc, + "make": make, + "makefile": str(root / "Makefile") if (root / "Makefile").exists() else None, + }, + ) + + def ensure_env(self, repo_root: Path) -> EnvHandle: + env = self.detect_env(repo_root) + if env is None or not env.extra.get("cc"): + raise ToolchainUnavailable("C compiler is not available on PATH") + return env + + def test_command(self, env: EnvHandle, selectors: list[str] | None = None) -> list[str]: + make = env.extra.get("make") if env.extra else None + makefile = env.project_root / "Makefile" + if make and makefile.exists(): + return [make, "test"] + cc = env.extra.get("cc") if env.extra else None + if not cc: + raise ToolchainUnavailable("C compiler is not available on PATH") + sources = sorted(str(path) for path in env.project_root.rglob("*.c")) + return [ + cc, + "-std=c99", + "-I", + str(env.project_root), + "-Wall", + "-Wextra", + "-fsyntax-only", + *sources, + ] + + def install_deps_command(self, env: EnvHandle, deps: list[str]) -> list[str] | None: + return None + + def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult: + # The C test command is ctest/make when a harness exists, else a + # bare ``-fsyntax-only`` compile check that legitimately emits no + # output. So empty output is NOT a no-op here (a clean compile is a + # real signal); only ctest's explicit "no tests" / "out of 0" is. 
+ out_of = re.search(r"out of (\d+)", raw) + observed = int(out_of.group(1)) if out_of else None + if ran_no_tests( + exit_code, raw, observed_tests=observed, + no_tests_markers=("No tests were found",), + empty_output_is_no_op=False, + ): + status = "errored" + else: + status = "passed" if exit_code == 0 else "failed" + failures = [] if status != "failed" else [TestFailure( + test_id="c test", + short_message="C test command failed", + long_message=raw, + )] + fail_match = re.search(r"(\d+)\s+tests?\s+failed", raw) + return TestRunResult( + status=status, + exit_code=exit_code, + passed_count=0, + failed_count=( + int(fail_match.group(1)) if fail_match + else (1 if status == "failed" else 0) + ), + error_count=0, + skipped_count=0, + duration_sec=0.0, + failures=failures, + raw_output=raw, + extra={"tool": "make test or compiler syntax check"}, + ) + + _PROMPT_HINTS_SINGLETON: PromptHints | None = None + + def prompt_hints(self) -> PromptHints: + cached = CBackend._PROMPT_HINTS_SINGLETON + if cached is not None: + return cached + hints = PromptHints( + display_name=self.display_name, + markdown_fence=self.markdown_fence, + file_extension=self.file_extension, + module_naming_rule=( + "Use lowercase snake_case C file names; public declarations live " + "in .h headers and implementations in matching .c files." + ), + package_layout_example=( + "Makefile\n" + "src/\n" + " main.c\n" + " task.c\n" + " task.h\n" + "tests/\n" + " test_task.c\n" + ), + entrypoint_example="src/main.c", + test_framework_name="make test", + style_directive=( + "Write idiomatic C99: explicit ownership rules, checked return " + "values, small header APIs, and no hidden global state unless " + "the requirements explicitly call for it." 
+ ), + ) + CBackend._PROMPT_HINTS_SINGLETON = hints + return hints + + def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates: + # Reuse the planner-reconciled entry path when provided (avoids a + # second ``main`` file); else fall back to the canonical path. + entry = context.entry_point_path or "src/main.c" + return ProjectTaskTemplates( + dependencies=f"""Generate or update C build metadata for the repository: {context.repo_name} + +**Files to create/update:** +1. `Makefile` - Build, run, and test targets for the C project. + +**Instructions:** +1. Prefer standard C99 and the C standard library. +2. Keep compiler flags strict: `-std=c99 -Wall -Wextra`. +3. Provide `make`, `make test`, and `make clean` targets. +4. Keep generated binaries and test artefacts out of source control. + +**Important:** +- Do NOT create Python dependency files for a C project. +- Do NOT introduce third-party dependencies unless the implemented code requires them. +""", + main_entry=f"""Create the C command entry point for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Create a production-quality C CLI entry point that exposes the documented product behavior. + +**Files to create:** +1. `{entry}` - CLI entry point. +2. Headers only when needed to call implemented modules. + +**Critical Rules:** +- Do NOT re-implement business logic in the entry file; delegate to implemented modules. +- Include real project headers and call real symbols. +- Validate arguments and return non-zero for user-facing failures. +- Keep output plain text unless requirements say otherwise. +- This is the ONLY program entry point in the repository. If `{entry}` already exists, extend it in place — do NOT create a second entry file. + +**Requirements:** +1. Provide `int main(int argc, char **argv)`. +2. Implement `--help` and documented commands/options. +3. Delegate storage and task lifecycle behavior to existing C modules. +4. 
Verify with `make` and `make test`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this C project. +""", + readme=f"""Update the README.md for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual C CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Build +- C compiler prerequisite +- `make` instructions + +## 3. Usage +- How to run the compiled binary and `--help` +- Common command examples with expected plain-text output + +## 4. Project Structure +- Brief overview of `src/`, headers, tests, and Makefile targets + +## 5. Development +- How to run `make test` +- How to clean build artefacts + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this C project. +- Base everything on the actual implemented code, not assumptions. 
+""", + ) + + @staticmethod + def _parser() -> Any: + from lang_parser import get_parser # type: ignore + + return get_parser("c") + + @staticmethod + def _parse_path(path: str) -> str: + if path == "" or not path.endswith(_C_SOURCE_EXTENSIONS): + return "main.c" + return path + + def _parse(self, code: str, path: str): + try: + return self._parser().parse_file(self._parse_path(path), code) + except Exception: + return None + + @staticmethod + def _find_compiler() -> str | None: + return shutil.which("cc") or shutil.which("gcc") or shutil.which("clang") + + +__all__ = ["CBackend"] \ No newline at end of file diff --git a/CoderMind/scripts/decoder_lang/cpp_backend.py b/CoderMind/scripts/decoder_lang/cpp_backend.py new file mode 100644 index 0000000..e3dc55a --- /dev/null +++ b/CoderMind/scripts/decoder_lang/cpp_backend.py @@ -0,0 +1,368 @@ +"""Production :class:`LanguageBackend` implementation for C++.""" +from __future__ import annotations + +import re +import shutil +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestFailure, TestRunResult, ran_no_tests + +_CPP_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") +_CPP_IDENT_INVALID = re.compile(r"[^A-Za-z0-9_]") +_PLACEHOLDER_RE = re.compile( + r"(?is)\b(?:TODO|PLACEHOLDER|NOT IMPLEMENTED|throw\s+std::logic_error|abort\s*\()" +) +_CPP_SOURCE_EXTENSIONS = (".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx", ".h") +_CPP_KEYWORDS = frozenset({ + "alignas", "alignof", "and", "asm", "auto", "bool", "break", + "case", "catch", "char", "class", "const", "constexpr", "continue", + "decltype", "default", "delete", "do", "double", "else", "enum", + "explicit", "export", "extern", "false", "float", "for", "friend", + "goto", "if", "inline", "int", "long", "namespace", "new", "noexcept", + "operator", 
"private", "protected", "public", "return", "short", + "signed", "sizeof", "static", "struct", "switch", "template", "this", + "throw", "true", "try", "typedef", "typename", "union", "unsigned", + "using", "virtual", "void", "volatile", "while", +}) + + +class CppBackend: + """:class:`LanguageBackend` for C++ source.""" + + name = "cpp" + display_name = "C++" + file_extension = ".cpp" + markdown_fence = "cpp" + + def is_source_file(self, path: str) -> bool: + return path.endswith(_CPP_SOURCE_EXTENSIONS) + + def is_test_file(self, path: str) -> bool: + normalised = path.replace("\\", "/") + basename = normalised.rsplit("/", 1)[-1] + return ( + f"/{normalised}".startswith("/tests/") + or "/tests/" in f"/{normalised}" + or basename.endswith(("_test.cpp", "_test.cc", "_test.cxx")) + or basename.startswith("test_") and basename.endswith((".cpp", ".cc", ".cxx")) + ) + + def package_marker_filename(self) -> str | None: + return None + + def package_marker_content(self, pkg_path: str) -> str | None: + return None + + def is_valid_module_identifier(self, segment: str) -> bool: + if not segment or segment in _CPP_KEYWORDS: + return False + return bool(_CPP_IDENT_RE.match(segment)) + + def sanitize_module_identifier(self, segment: str) -> str: + if not segment: + return "_" + cleaned = _CPP_IDENT_INVALID.sub("_", segment) + if cleaned[:1].isdigit(): + cleaned = f"_{cleaned}" + if cleaned in _CPP_KEYWORDS: + cleaned = f"{cleaned}_" + return cleaned + + def has_placeholder(self, code: str, path: str = "") -> bool: + ok, _ = self.syntax_check(code, path) + return ok and bool(_PLACEHOLDER_RE.search(code)) + + def syntax_check(self, code: str, path: str = "") -> tuple[bool, str | None]: + return self._parser().validate_syntax(self._parse_path(path), code) + + def list_code_units(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [ + unit for unit in result.units + if unit.unit_type in 
{"class", "struct", "enum", "function", "method"} + ] + + def format_signature(self, unit: Any) -> str: + if unit is None: + return "" + code = (getattr(unit, "code", "") or "").strip() + if not code: + return getattr(unit, "name", "") or "" + first = code.split("{", 1)[0].split(";", 1)[0].strip() + return " ".join(first.split()) or (getattr(unit, "name", "") or "") + + def list_imports(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "imports"] + + def list_inheritance(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "inherits"] + + def unit_kind(self, unit_name: str) -> str: + return classify_unit_kind(unit_name) + + def is_callable_unit(self, unit_name: str) -> bool: + return classify_unit_kind(unit_name) == "callable" + + def entry_point_path(self, module: str) -> str: + return "src/main.cpp" + + def find_existing_entry(self, interfaces: dict) -> str | None: + from .backend import default_find_existing_entry + + return default_find_existing_entry(self, interfaces) + + def entry_point_candidates(self) -> list[str]: + return [self.entry_point_path("")] + + def prepare_test_env(self, env: EnvHandle) -> None: + from .backend import cmake_reconfigure + + cmake_reconfigure(env) + + def entry_run_command(self, repo_root: Path, entry: str) -> list[str] | None: + # Compiled CLI: binary name is project-specific; the smoke layer + # locates the built artifact rather than guessing here. 
def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult:
    """Convert raw C++ test-command output into a :class:`TestRunResult`.

    The C++ test command is ctest/make when a harness exists, else a bare
    ``-fsyntax-only`` compile check that legitimately emits no output.  Empty
    output is therefore NOT treated as a no-op here (a clean compile is a
    real signal); only ctest's explicit "no tests" / "out of 0" is.

    Args:
        raw: Combined output captured from the test command.
        exit_code: Process exit status of the test command.

    Returns:
        A result whose status is ``"errored"`` for a detected zero-test run,
        ``"passed"`` for exit code 0 and ``"failed"`` otherwise.
    """
    total_match = re.search(r"out of (\d+)", raw)
    observed = int(total_match.group(1)) if total_match else None
    if ran_no_tests(
        exit_code, raw, observed_tests=observed,
        no_tests_markers=("No tests were found",),
        empty_output_is_no_op=False,
    ):
        status = "errored"
    elif exit_code == 0:
        status = "passed"
    else:
        status = "failed"

    failures = []
    if status == "failed":
        failures.append(TestFailure(
            test_id="c++ test",
            short_message="C++ test command failed",
            long_message=raw,
        ))

    fail_match = re.search(r"(\d+)\s+tests?\s+failed", raw)
    if fail_match:
        failed_count = int(fail_match.group(1))
    else:
        failed_count = 1 if status == "failed" else 0

    # ctest prints "X% tests passed, N tests failed out of M".  When both
    # numbers are present the pass count is M - N; previously it was always
    # reported as 0, even for a fully green ctest run.  Without the summary
    # (make / syntax-only check) the count stays 0 as before.
    if observed is not None and fail_match:
        passed_count = max(observed - failed_count, 0)
    else:
        passed_count = 0

    return TestRunResult(
        status=status,
        exit_code=exit_code,
        passed_count=passed_count,
        failed_count=failed_count,
        error_count=0,
        skipped_count=0,
        duration_sec=0.0,
        failures=failures,
        raw_output=raw,
        extra={"tool": "ctest, make test, or compiler syntax check"},
    )
+ ), + ) + CppBackend._PROMPT_HINTS_SINGLETON = hints + return hints + + def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates: + # Reuse the planner-reconciled entry path when provided (avoids a + # second ``main`` file); else fall back to the canonical path. + entry = context.entry_point_path or "src/main.cpp" + return ProjectTaskTemplates( + dependencies=f"""Generate or update C++ build metadata for the repository: {context.repo_name} + +**Files to create/update:** +1. `CMakeLists.txt` - C++17 project, executable, and test target definitions. + +**Instructions:** +1. Prefer the C++ standard library. +2. Use C++17 unless implemented code requires a newer standard. +3. Provide build and test instructions compatible with CMake and ctest. +4. Keep generated binaries and build directories out of source control. + +**Important:** +- Do NOT create Python dependency files for a C++ project. +- Do NOT introduce third-party dependencies unless the implemented code requires them. +""", + main_entry=f"""Create the C++ command entry point for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Create a production-quality C++ CLI entry point that exposes the documented product behavior. + +**Files to create:** +1. `{entry}` - CLI entry point. +2. Headers only when needed to call implemented modules. + +**Critical Rules:** +- Do NOT re-implement business logic in the entry file; delegate to implemented modules. +- Include real project headers and call real symbols. +- Validate arguments and return non-zero for user-facing failures. +- Keep output plain text unless requirements say otherwise. +- This is the ONLY program entry point in the repository. If `{entry}` already exists, extend it in place — do NOT create a second entry file. + +**Requirements:** +1. Provide `int main(int argc, char **argv)`. +2. Implement `--help` and documented commands/options. +3. 
Delegate storage and task lifecycle behavior to existing C++ modules. +4. Verify with a CMake build and `ctest`, or with `make test` when the project uses a Makefile. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this C++ project. +""", + readme=f"""Update the README.md for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual C++ CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Build +- C++ compiler and CMake prerequisites +- Configure/build commands + +## 3. Usage +- How to run the compiled binary and `--help` +- Common command examples with expected plain-text output + +## 4. Project Structure +- Brief overview of `src/`, headers, tests, and CMake targets + +## 5. Development +- How to run tests with `ctest` or `make test` +- How to clean build artefacts + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this C++ project. +- Base everything on the actual implemented code, not assumptions. 
+""", + ) + + @staticmethod + def _parser() -> Any: + from lang_parser import get_parser # type: ignore + + return get_parser("cpp") + + @staticmethod + def _parse_path(path: str) -> str: + if path == "" or not path.endswith(_CPP_SOURCE_EXTENSIONS): + return "src/main.cpp" + if path.endswith(".h"): + return f"{path[:-2]}.hpp" + return path + + def _parse(self, code: str, path: str): + try: + return self._parser().parse_file(self._parse_path(path), code) + except Exception: + return None + + @staticmethod + def _find_compiler() -> str | None: + return shutil.which("c++") or shutil.which("g++") or shutil.which("clang++") + + +__all__ = ["CppBackend"] \ No newline at end of file diff --git a/CoderMind/scripts/decoder_lang/go_backend.py b/CoderMind/scripts/decoder_lang/go_backend.py new file mode 100644 index 0000000..d1e24f5 --- /dev/null +++ b/CoderMind/scripts/decoder_lang/go_backend.py @@ -0,0 +1,536 @@ +"""Production :class:`LanguageBackend` implementation for Go.""" +from __future__ import annotations + +import logging +import re +import shutil +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestFailure, TestRunResult, ran_no_tests + +logger = logging.getLogger(__name__) + +# Go identifier rule: ASCII letters / digits / underscore; cannot start +# with a digit. Hyphens are illegal (unlike many tools' file-name +# conventions, Go's *package* names must be valid identifiers). 
def is_test_file(self, path: str) -> bool:
    """Tell whether *path* names a Go test file (``*_test.go``).

    Both ``/`` and ``\\`` are accepted as directory separators; only the
    final path component is inspected.
    """
    _, _, leaf = path.replace("\\", "/").rpartition("/")
    return leaf.endswith("_test.go")
def format_signature(self, unit: Any) -> str:
    """Return a one-line signature for a parsed Go code unit.

    Functions and methods yield their declaration up to (not including) the
    opening brace of the body, with a multi-line parameter list joined by
    single spaces.  Every other unit type (struct, interface, ...) yields
    just its name.

    Args:
        unit: Parsed unit exposing optional ``name``, ``unit_type`` and
            ``code`` attributes, or ``None``.

    Returns:
        The signature text, the unit name when no signature can be derived,
        or ``""`` when ``unit`` is ``None``.
    """
    if unit is None:
        return ""
    name = getattr(unit, "name", None) or ""
    if getattr(unit, "unit_type", None) not in {"function", "method"}:
        return name
    code = (getattr(unit, "code", "") or "").strip()
    if not code:
        return name

    signature_lines: list[str] = []
    for line in code.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        # Blank out empty type literals (``interface{}`` / ``struct{}``) with
        # same-length filler so their braces are not mistaken for the body's
        # opening brace; indexes into ``masked`` stay valid for ``stripped``.
        masked = re.sub(
            r"\b(?:interface|struct)\s*\{\s*\}",
            lambda match: "_" * len(match.group(0)),
            stripped,
        )
        brace = masked.find("{")
        if brace != -1:
            head = stripped[:brace].rstrip()
            if head:
                signature_lines.append(head)
            break
        signature_lines.append(stripped)
        # A body-less declaration ends on its closing parenthesis or on an
        # ``error`` return following it.
        if stripped.endswith((")", ") error")):
            break

    if not signature_lines:
        return name
    return " ".join(signature_lines)
def entry_run_command(self, repo_root: Path, entry: str) -> list[str] | None:
    """Build the ``go run`` smoke command for an existing entry file.

    Args:
        repo_root: Repository root directory.
        entry: Entry file path relative to ``repo_root``.

    Returns:
        ``["go", "run", "./<package dir>", "--help"]`` when the entry file
        exists on disk, otherwise ``None``.
    """
    entry_file = repo_root / entry
    if entry_file.is_file():
        # ``go run`` takes the package directory, always with forward slashes.
        package_dir = str(Path(entry).parent).replace("\\", "/")
        target = f"./{package_dir}"
        return ["go", "run", target, "--help"]
    return None
Build / test environment + # ------------------------------------------------------------------ + + def detect_env(self, repo_root: Path) -> EnvHandle | None: + go_exe = shutil.which("go") + if not go_exe: + return None + root = repo_root.resolve() + module_file = root / "go.mod" + return EnvHandle( + project_root=root, + runtime_executable=go_exe, + extra={ + "module_file": str(module_file) if module_file.exists() else None, + "module": self._read_module_name(module_file), + }, + ) + + def ensure_env(self, repo_root: Path) -> EnvHandle: + env = self.detect_env(repo_root) + if env is None: + raise ToolchainUnavailable("Go toolchain is not available on PATH") + module_file = env.project_root / "go.mod" + if not module_file.exists(): + module_name = self._default_module_name(env.project_root) + module_file.write_text( + f"module {module_name}\n\ngo 1.22\n", + encoding="utf-8", + ) + return EnvHandle( + project_root=env.project_root, + runtime_executable=env.runtime_executable, + extra={"module_file": str(module_file), "module": module_name}, + ) + return env + + def test_command( + self, + env: EnvHandle, + selectors: list[str] | None = None, + ) -> list[str]: + go_exe = env.runtime_executable or "go" + # ``-v`` makes ``go test`` emit per-test ``=== RUN`` / ``--- PASS`` + # lines that :meth:`parse_test_output` counts. Without it the output + # is only the ``ok `` package summary, so passed_count stays 0 — + # a real run would then look indistinguishable from a zero-test no-op. 
def install_deps_command(
    self,
    env: EnvHandle,
    deps: list[str],
) -> list[str] | None:
    """Return the ``go get`` command installing *deps*.

    Args:
        env: Environment handle; its ``runtime_executable`` is used as the Go
            binary, falling back to ``"go"`` when it is unset.
        deps: Module paths to fetch.

    Returns:
        The argument vector, or ``None`` when there is nothing to install.
    """
    if deps:
        return [env.runtime_executable or "go", "get", *deps]
    return None
Only trust a POSITIVE parsed count as proof tests ran; + # a parsed 0 is ambiguous (the ``-json`` stream may not match the + # text-format regexes), so fall back to the output signal rather + # than false-failing a real run. + observed = passed_count + failed_count + skipped_count + if ran_no_tests(exit_code, raw, observed_tests=observed or None): + status = "errored" + else: + status = "passed" + elif failed_count: + status = "failed" + else: + status = "errored" + + return TestRunResult( + status=status, + exit_code=exit_code, + passed_count=passed_count, + failed_count=failed_count, + error_count=0 if status != "errored" else 1, + skipped_count=skipped_count, + duration_sec=duration_sec, + failures=failures, + raw_output=raw, + extra={"tool": "go test"}, + ) + + # ------------------------------------------------------------------ + # 4. Prompt hints + # ------------------------------------------------------------------ + + _PROMPT_HINTS_SINGLETON: PromptHints | None = None + + def prompt_hints(self) -> PromptHints: + cached = GoBackend._PROMPT_HINTS_SINGLETON + if cached is not None: + return cached + hints = PromptHints( + display_name=self.display_name, + markdown_fence=self.markdown_fence, + file_extension=self.file_extension, + module_naming_rule=( + "Use short, lowercase package directory names with no " + "underscores; tests live next to source as _test.go." + ), + package_layout_example=( + "cmd/\n" + " myapp/\n" + " main.go\n" + "internal/\n" + " core/\n" + " core.go\n" + " core_test.go\n" + "go.mod\n" + ), + entrypoint_example="cmd//main.go", + test_framework_name="go test", + style_directive=( + "Write idiomatic Go: short, lowercase package names; " + "explicit error returns; small interfaces consumed at " + "the call site rather than declared up-front." 
def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates:
    """Build the Go-specific prompts for the synthetic project tasks.

    Args:
        context: Supplies ``package_name`` (used as the Go module path and as
            the default command directory), ``entry_point_path``,
            ``repo_name`` and ``repo_info``.

    Returns:
        Templates for the dependency, main-entry and README tasks.
    """
    module_name = context.package_name
    # Reuse the planner-reconciled entry path when provided (avoids a
    # second command package); else fall back to the canonical path.
    command_path = context.entry_point_path or f"cmd/{module_name}/main.go"
    # Directory passed to ``go run ./<dir>`` — derived from the actual
    # command path so docs and verify steps match the real entry.
    command_dir = command_path.rsplit("/", 1)[0] if "/" in command_path else "."
    # NOTE: the main-entry prompt previously jumped from the "Critical Rules"
    # bullets straight to item "2."; the "**Requirements:**" heading and
    # item 1 are restored so the numbered list is well-formed, matching the
    # sibling backends' templates.
    return ProjectTaskTemplates(
        dependencies=f"""Generate or update Go module dependency files for the repository: {context.repo_name}

**Files to create/update:**
1. `go.mod` - Go module declaration using module path `{module_name}`
2. `go.sum` - Only if external dependencies are introduced

**Instructions:**
1. Prefer the Go standard library. Do not add third-party dependencies unless the implemented code already requires them.
2. If there are no external dependencies, create a minimal `go.mod` with a current Go version.
3. If dependencies are needed, run `go mod tidy` after adding imports.
4. Verify the module with `go test ./...`.

**Important:**
- Do NOT create Python dependency files for a Go project.
- Keep the module compact and local to this repository.
- The fixture expects standard-library-only code unless the implementation proves otherwise.
""",
        main_entry=f"""Create the Go command entry point for the repository: {context.repo_name}
Repository purpose: {context.repo_info}

**Goal:** Create a production-quality Go CLI entry point that lets users run the complete product through documented commands.

**Files to create:**
1. `{command_path}` - Main package for the CLI command.

**Critical Rules:**
- Do NOT re-implement business logic in `main.go`. Import and delegate to internal packages already defined in the project.
- Every import must reference real packages and symbols from this module.
- Use idiomatic Go error handling with explicit non-zero exits on user-facing failures.
- Keep output plain text unless the requirements explicitly ask otherwise.
- This is the ONLY `package main` / `func main()` in the repository. If `{command_path}` already exists, extend it in place — do NOT create a second command package.

**Requirements:**
1. Declare `package main` and provide `func main()`.
2. Provide `--help` output and subcommands/options that expose all major CLI features.
3. Delegate to implemented internal packages for task storage and task lifecycle behavior.
4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly.
5. Verify with `go run ./{command_dir} --help` and `go test ./...`.

**Important:**
- Read `docs/` first and faithfully expose the requested behavior.
- Do NOT create Python package entry points for this Go project.
""",
        readme=f"""Update the README.md for the repository: {context.repo_name}
Repository purpose: {context.repo_info}

**Goal:** Replace the placeholder README with comprehensive documentation for the actual Go CLI implementation.

**Sections to include:**

## 1. Project Title & Description
- Clear, concise description of what the CLI does
- Key commands and capabilities

## 2. Installation
- Go version prerequisite
- Clone/build instructions
- Module setup using `go mod tidy` when needed

## 3. Usage
- How to run the CLI with `go run ./{command_dir} --help`
- Common command examples with expected plain-text output
- Data file options and local persistence behavior if applicable

## 4. Project Structure
- Brief overview of `cmd/`, `internal/`, and test files
- Key packages and their purposes

## 5. Development
- How to run tests with `go test ./...`
- How to format code with `gofmt`

**Instructions:**
1. Read the `docs/` directory for the original requirements.
2. Explore the actual Go codebase to understand what was implemented.
3. Run `go run ./{command_dir} --help` if the command exists.
4. Reference actual package names, types, and functions.

**Important:**
- Do NOT document Python commands, Python test runners, or Python dependency files for this Go project.
- Base everything on the actual implemented code, not assumptions.
- Keep the tone professional and concise.
""",
    )
b/CoderMind/scripts/decoder_lang/javascript_backend.py new file mode 100644 index 0000000..7429747 --- /dev/null +++ b/CoderMind/scripts/decoder_lang/javascript_backend.py @@ -0,0 +1,367 @@ +"""Production :class:`LanguageBackend` implementation for JavaScript. + +Mirrors :class:`decoder_lang.typescript_backend.TypeScriptBackend` but +targets plain Node.js JavaScript: ``.js`` / ``.mjs`` / ``.cjs`` sources, +no ``tsconfig.json`` and no type annotations, and the ``javascript`` +:mod:`lang_parser` grammar. Tests run through ``node --test`` / ``npm +test`` exactly like the TypeScript backend. +""" +from __future__ import annotations + +import json +import re +import shutil +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestFailure, TestRunResult, ran_no_tests + +_JS_SEGMENT_RE = re.compile(r"^[A-Za-z0-9_$-]+$") +_JS_SEGMENT_INVALID = re.compile(r"[^A-Za-z0-9_$-]") +_PLACEHOLDER_RE = re.compile( + r"(?is)\b(?:TODO|PLACEHOLDER|NOT IMPLEMENTED|throw\s+new\s+Error\s*\()" +) +_JS_LINE_COMMENT_RE = re.compile(r"//.*?$", re.MULTILINE) +_JS_BLOCK_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL) + +_JS_SOURCE_SUFFIXES = (".js", ".mjs", ".cjs", ".jsx") +_JS_TEST_SUFFIXES = ( + ".test.js", ".spec.js", + ".test.mjs", ".spec.mjs", + ".test.cjs", ".spec.cjs", + ".test.jsx", ".spec.jsx", +) + + +class JavaScriptBackend: + """:class:`LanguageBackend` for JavaScript source.""" + + name = "javascript" + display_name = "JavaScript" + file_extension = ".js" + markdown_fence = "javascript" + + def is_source_file(self, path: str) -> bool: + return path.endswith(_JS_SOURCE_SUFFIXES) + + def is_test_file(self, path: str) -> bool: + normalised = path.replace("\\", "/") + basename = normalised.rsplit("/", 1)[-1] + return ( + "/tests/" in f"/{normalised}" + or 
def format_signature(self, unit: Any) -> str:
    """Return a one-line declaration for a parsed JavaScript code unit.

    The signature is the unit's source text up to the opening ``{`` of its
    body, or up to the ``=>`` of an arrow function, with whitespace runs
    collapsed to single spaces.

    Args:
        unit: Parsed unit exposing optional ``code`` and ``name`` attributes,
            or ``None``.

    Returns:
        The collapsed signature; the unit's ``name`` when there is no code or
        the signature comes out empty; ``""`` when ``unit`` is ``None``.
    """
    if unit is None:
        return ""
    name = getattr(unit, "name", "") or ""
    code = (getattr(unit, "code", "") or "").strip()
    if not code:
        return name

    # Only a ``{`` or ``=>`` OUTSIDE every parenthesis pair ends the
    # declaration.  A first-occurrence split truncates destructured or
    # object-default parameters, e.g. ``function f({ id }, opts = {}) {``.
    depth = 0
    cut = None
    for index, char in enumerate(code):
        if char == "(":
            depth += 1
        elif char == ")":
            depth = max(depth - 1, 0)
        elif depth == 0 and (char == "{" or code.startswith("=>", index)):
            cut = index
            break

    if cut is not None:
        head = code[:cut]
    elif depth:
        # Unbalanced parentheses: keep the historical first-occurrence cut.
        head = code.split("{", 1)[0].split("=>", 1)[0]
    else:
        head = code
    return " ".join(head.split()) or name
def entry_run_command(self, repo_root: Path, entry: str) -> list[str] | None:
    """Choose the smoke command that prints the CLI's ``--help``.

    Args:
        repo_root: Repository root directory.
        entry: Entry file path relative to ``repo_root``.

    Returns:
        ``npm start -- --help`` when the repository has a ``package.json``;
        otherwise ``node <entry> --help`` when the entry file exists;
        otherwise ``None``.
    """
    manifest = repo_root / "package.json"
    if manifest.is_file():
        # A package manifest wins: the CLI is launched through its script.
        return ["npm", "start", "--", "--help"]
    script = repo_root / entry
    return ["node", entry, "--help"] if script.is_file() else None
"test", *(selectors or [])] + + def install_deps_command(self, env: EnvHandle, deps: list[str]) -> list[str] | None: + if not deps: + return None + executable = env.runtime_executable or "npm" + if Path(executable).name == "node": + return None + return [executable, "install", *deps] + + def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult: + # node:test prints a TAP summary ("# tests N", "# pass N", + # "# fail N"). Use the test count to tell a real run from a no-op + # that exits 0 without running anything (which must not pass a gate). + tests_match = re.search(r"(?m)^#?\s*tests\s+(\d+)\b", raw) + observed = int(tests_match.group(1)) if tests_match else None + if ran_no_tests(exit_code, raw, observed_tests=observed): + status = "errored" + else: + status = "passed" if exit_code == 0 else "failed" + failures = [] if status != "failed" else [TestFailure( + test_id="npm test", + short_message="npm test failed", + long_message=raw, + )] + pass_match = re.search(r"(?m)^#?\s*pass\s+(\d+)\b", raw) + fail_match = re.search(r"(?m)^#?\s*fail\s+(\d+)\b", raw) + return TestRunResult( + status=status, + exit_code=exit_code, + passed_count=int(pass_match.group(1)) if pass_match else 0, + failed_count=( + int(fail_match.group(1)) if fail_match + else (1 if status == "failed" else 0) + ), + error_count=0, + skipped_count=0, + duration_sec=0.0, + failures=failures, + raw_output=raw, + extra={"tool": "npm test"}, + ) + + _PROMPT_HINTS_SINGLETON: PromptHints | None = None + + def prompt_hints(self) -> PromptHints: + cached = JavaScriptBackend._PROMPT_HINTS_SINGLETON + if cached is not None: + return cached + hints = PromptHints( + display_name=self.display_name, + markdown_fence=self.markdown_fence, + file_extension=self.file_extension, + module_naming_rule=( + "Use kebab-case or short lowercase directory names; source " + "files live under src/ and tests under tests/ or *.test.js." 
+ ), + package_layout_example=( + "package.json\n" + "src/\n" + " index.js\n" + " cli.js\n" + "tests/\n" + " cli.test.js\n" + ), + entrypoint_example="src/index.js", + test_framework_name="npm test", + style_directive=( + "Write idiomatic modern JavaScript (ES modules): named " + "exports, async-aware APIs, and Node.js standard modules for " + "local CLI/file operations. Do NOT use TypeScript type " + "annotations or .ts files." + ), + ) + JavaScriptBackend._PROMPT_HINTS_SINGLETON = hints + return hints + + def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates: + # Reuse the planner-reconciled entry path when provided (avoids a + # second entry file); else fall back to the canonical path. + entry = context.entry_point_path or "src/index.js" + return ProjectTaskTemplates( + dependencies=f"""Generate or update Node.js/JavaScript dependency files for the repository: {context.repo_name} + +**Files to create/update:** +1. `package.json` - Package metadata, scripts, and dependencies using package name `{context.package_name}` +2. `package-lock.json` - Only if dependency installation creates it + +**Instructions:** +1. Prefer Node.js standard APIs for local file and CLI behavior. +2. Use ES modules (`"type": "module"`) and the built-in `node --test` runner unless the code needs more. +3. Provide scripts for `npm start` and `npm test` when appropriate. +4. Run `npm test` after updating dependencies. + +**Important:** +- Do NOT create Python dependency files or a `tsconfig.json` for a JavaScript project. +- Do NOT use TypeScript: no type annotations and no `.ts` files. +- Keep dependencies minimal and aligned with actual imports. +""", + main_entry=f"""Create the JavaScript command entry point for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Create a production-quality Node.js CLI entry point that lets users run the complete product through documented commands. + +**Files to create:** +1. 
`{entry}` - CLI entry point referenced by package scripts. +2. `src/cli.js` (optional) - Command parsing and dispatch separated from domain logic. + +**Critical Rules:** +- Do NOT re-implement business logic in the entry file. Import and delegate to implemented modules. +- Every import must reference real files and exported symbols. +- Use explicit error handling and non-zero process exits for user-facing failures. +- This is the ONLY program entry point in the repository. If `{entry}` already exists, extend it in place — do NOT create a second entry file. +- Keep output plain text unless the requirements explicitly ask otherwise. + +**Requirements:** +1. Expose all major CLI commands and options described in `docs/`. +2. Wire `package.json` scripts so users can run the CLI with `npm start -- --help`. +3. Delegate storage and task lifecycle behavior to implemented modules. +4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly. +5. Verify with `npm start -- --help` and `npm test`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python entry points or TypeScript files for this JavaScript project. +""", + readme=f"""Update the README.md for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual JavaScript CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Installation +- Node.js/npm prerequisite +- Clone/install instructions using `npm install` + +## 3. Usage +- How to run the CLI with `npm start -- --help` +- Common command examples with expected plain-text output +- Data file options and local persistence behavior if applicable + +## 4. 
Project Structure +- Brief overview of `src/`, `tests/`, and configuration files +- Key modules and their purposes + +## 5. Development +- How to run tests with `npm test` + +**Instructions:** +1. Read the `docs/` directory for the original requirements. +2. Explore the actual JavaScript codebase to understand what was implemented. +3. Run `npm start -- --help` if package scripts exist. +4. Reference actual exported functions and modules. + +**Important:** +- Do NOT document Python commands or TypeScript tooling for this JavaScript project. +- Base everything on the actual implemented code, not assumptions. +- Keep the tone professional and concise. +""", + ) + + @staticmethod + def _parser() -> Any: + from lang_parser import get_parser # type: ignore + + return get_parser("javascript") + + @staticmethod + def _parse_path(path: str) -> str: + if path == "" or not path.endswith(_JS_SOURCE_SUFFIXES): + return "src/index.js" + return path + + @staticmethod + def _parse_source(code: str) -> str: + return _JS_LINE_COMMENT_RE.sub("", _JS_BLOCK_COMMENT_RE.sub("", code)) + + def _parse(self, code: str, path: str): + try: + return self._parser().parse_file(self._parse_path(path), code) + except Exception: + return None + + def _default_package_name(self, repo_root: Path) -> str: + raw = repo_root.name.lower().replace("_", "-").replace(" ", "-") + return self.sanitize_module_identifier(raw) diff --git a/CoderMind/scripts/decoder_lang/project_tasks.py b/CoderMind/scripts/decoder_lang/project_tasks.py new file mode 100644 index 0000000..e68ae65 --- /dev/null +++ b/CoderMind/scripts/decoder_lang/project_tasks.py @@ -0,0 +1,30 @@ +"""Project-level task prompt templates owned by language backends.""" +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ProjectTaskContext: + """Context used to render language-specific project tasks.""" + + repo_name: str + repo_info: str + package_name: str + # Reconciled program entry-point 
path. The planner resolves this from + # the already-designed interfaces (e.g. reusing an existing + # ``cmd//main.go`` instead of generating a second command + # package). When ``None``, the backend falls back to its canonical + # entry path. Keeps entry *format* in the backend while letting the + # planner — which alone sees the full interface set — decide *which* + # file, so a single entry source is preserved. + entry_point_path: str | None = None + + +@dataclass(frozen=True) +class ProjectTaskTemplates: + """Rendered project-level task prompts for a target language.""" + + dependencies: str + main_entry: str + readme: str \ No newline at end of file diff --git a/CoderMind/scripts/decoder_lang/prompt_directive.py b/CoderMind/scripts/decoder_lang/prompt_directive.py new file mode 100644 index 0000000..ceec2ea --- /dev/null +++ b/CoderMind/scripts/decoder_lang/prompt_directive.py @@ -0,0 +1,64 @@ +"""Helpers for injecting a language-specific preamble into decoder prompts. + +The ``language_directive`` builder lets prompt-rendering call sites +prepend target-language guidance to a system prompt when the requested +backend is not Python. Prompt templates can keep their normal body +while receiving a compact language preamble at render time. + +Design: + +* When the resolved language is ``"python"`` the directive is the + empty string, so existing Python prompt output is unchanged. +* When the language differs, a short directive (display name, + one-line style note, markdown fence reminder) is prepended so the + LLM receives the target-language constraints before the task prompt. +""" +from __future__ import annotations + +from typing import Optional + +from .backend import LanguageBackend + + +_PYTHON_DEFAULT_NAME = "python" + + +def language_directive(backend: Optional[LanguageBackend]) -> str: + """Return a short preamble appropriate for ``backend``'s language. 
+ + Empty string for Python (and for ``backend is None``) so that + callers can unconditionally prepend the return value without + introducing any diff to the existing Python prompt output. For + every other language the preamble carries the display name, a + style directive, and a markdown-fence reminder so the LLM emits + the right kind of code. + """ + if backend is None or backend.name == _PYTHON_DEFAULT_NAME: + return "" + hints = backend.prompt_hints() + # Compact, neutral-tone preamble. Two newlines after the block so + # it visibly separates from whatever the caller appends next. + lines = [ + f"### Target language: {hints.display_name}", + hints.style_directive.strip(), + ( + f"Emit all code fences as ```{hints.markdown_fence} \u2026 ```. " + f"Source files use the ``{hints.file_extension}`` extension. " + f"Test framework: {hints.test_framework_name}." + ), + "Every code snippet must parse as standalone source for its target file.", + "", + ] + return "\n".join(lines) + "\n" + +def with_language_directive( + system_prompt: str, + backend: Optional[LanguageBackend], +) -> str: + """Convenience: prepend ``language_directive(backend)`` to a system + prompt body. Returns ``system_prompt`` unchanged when the + directive is empty (Python or no backend supplied).""" + directive = language_directive(backend) + if not directive: + return system_prompt + return directive + system_prompt diff --git a/CoderMind/scripts/decoder_lang/prompt_hints.py b/CoderMind/scripts/decoder_lang/prompt_hints.py new file mode 100644 index 0000000..e1fac7c --- /dev/null +++ b/CoderMind/scripts/decoder_lang/prompt_hints.py @@ -0,0 +1,43 @@ +"""Prompt-hint dataclass used by :meth:`LanguageBackend.prompt_hints`. + +Holds the strings the decoder injects into LLM prompts so a single +prompt template can render correctly for any target language. 
Prompt +renderers use these fields for display names, markdown fences, file +extensions, test framework names, and language-specific style guidance. + +Kept deliberately small: only fields a prompt template can reference +verbatim. Anything that needs computation (e.g. signature extraction) +stays on :class:`LanguageBackend` as a method. +""" +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class PromptHints: + """Per-language prompt-template fill values. + + All fields are short literals safe to substitute into any prompt + body. Backends construct one instance at module load and return it + unchanged from :meth:`LanguageBackend.prompt_hints` so callers can + cache it without worrying about identity. + """ + + # Identity / display + display_name: str # "Python" / "Go" + markdown_fence: str # "python" / "go" (after ```) + file_extension: str # ".py" / ".go" + + # Layout / naming guidance + module_naming_rule: str # single-sentence rule for file names + package_layout_example: str # short ASCII tree of idiomatic layout + entrypoint_example: str # e.g. "src/main.py" / "cmd//main.go" + + # Test framework + test_framework_name: str # "pytest" / "go test" + + # Free-form preamble injected at the top of every code-gen prompt + # to nudge the LLM toward idiomatic style. Keep concise (≤ 3 + # sentences) to avoid prompt bloat. + style_directive: str diff --git a/CoderMind/scripts/decoder_lang/python_backend.py b/CoderMind/scripts/decoder_lang/python_backend.py new file mode 100644 index 0000000..7ddcc6a --- /dev/null +++ b/CoderMind/scripts/decoder_lang/python_backend.py @@ -0,0 +1,518 @@ +"""Production :class:`LanguageBackend` implementation for Python. + +This is the production Python backend used by the CoderMind decoder +pipeline. 
It centralizes Python-specific file layout, syntax probes, +code-unit discovery, signature formatting, import extraction, and +prompt hints behind the shared :class:`LanguageBackend` protocol. + +Methods that are not wired into the decoder yet raise +:class:`NotImplementedError` so unsupported calls fail loudly instead +of silently producing incomplete language behaviour. +""" +from __future__ import annotations + +import ast +import keyword +import logging +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestRunResult + +logger = logging.getLogger(__name__) + + +# Hyphen → underscore + invalid char strip used by +# ``sanitize_module_identifier``. Cheap; kept module-level so the +# regex (and its compiled state) is constructed once. +import re + +_PY_IDENT_INVALID = re.compile(r"[^A-Za-z0-9_]") + + +# Placeholder markers shared by static completeness checks and backend +# placeholder detection. +_PLACEHOLDER_MARKERS: tuple[str, ...] = ( + "TODO", + "PLACEHOLDER", + "NOT IMPLEMENTED", +) + + +class PythonBackend: + """:class:`LanguageBackend` for Python source. + + See :class:`decoder_lang.backend.LanguageBackend` for method + contracts. Behaviour matches the pre-existing decoder's Python + assumptions exactly so dropping this backend into a call site is + safe by construction. + """ + + name = "python" + display_name = "Python" + file_extension = ".py" + markdown_fence = "python" + + # ------------------------------------------------------------------ + # 1. 
File & package layout + # ------------------------------------------------------------------ + + def is_source_file(self, path: str) -> bool: + """A ``*.py`` file that is not under a tests/ tree.""" + # Match the decoder's existing convention: caller is + # responsible for tests/ filtering via :meth:`is_test_file`; + # ``is_source_file`` only checks the extension so the union + # ``is_source_file or is_test_file`` covers everything the + # encoder considers Python. + return path.endswith(".py") + + def is_test_file(self, path: str) -> bool: + """Match common pytest conventions: ``tests/`` directory, + ``test_*.py`` / ``*_test.py`` file names.""" + normalised = path.replace("\\", "/") + if "/tests/" in f"/{normalised}/" or normalised.startswith("tests/"): + return True + basename = normalised.rsplit("/", 1)[-1] + return basename.startswith("test_") or basename.endswith("_test.py") + + def package_marker_filename(self) -> str | None: + return "__init__.py" + + def package_marker_content(self, pkg_path: str) -> str: + # The pre-existing skeleton emits an empty package marker; the + # ``pkg_path`` argument is accepted so future backends (e.g. + # Go ``doc.go``) can include the package name in the body. + return "" + + def is_valid_module_identifier(self, segment: str) -> bool: + """A non-empty Python identifier that is not a reserved keyword.""" + return bool(segment) and segment.isidentifier() and not keyword.iskeyword(segment) + + def sanitize_module_identifier(self, segment: str) -> str: + """Map an arbitrary path segment to a legal Python identifier. + + Rules: + * Replace any non-``[A-Za-z0-9_]`` char with ``_``. + * Prepend ``_`` when the result starts with a digit. + * Empty input becomes ``"_"`` (caller's job to avoid that). + Idempotent. 
+ """ + if not segment: + return "_" + cleaned = _PY_IDENT_INVALID.sub("_", segment) + if cleaned[:1].isdigit(): + cleaned = f"_{cleaned}" + return cleaned + + # ------------------------------------------------------------------ + # 2. Code structure + # ------------------------------------------------------------------ + + def has_placeholder(self, code: str, path: str = "") -> bool: + """Detect ``return "TODO..."`` style placeholder returns. + + Mirrors the placeholder-only check used by + ``static_completeness_check``. Stub-body detection remains in + ``static_checks.py`` because it needs statement-level context. + Returns False on syntax errors so an unparseable file isn't + misreported as containing a placeholder. + """ + try: + tree = ast.parse(code, filename=path) + except (SyntaxError, ValueError): + return False + for node in ast.walk(tree): + if not isinstance(node, ast.Return): + continue + if not isinstance(node.value, ast.Constant): + continue + val = node.value.value + if not isinstance(val, str): + continue + upper = val.upper() + if any(marker in upper for marker in _PLACEHOLDER_MARKERS): + return True + return False + + def syntax_check(self, code: str, path: str = "") -> tuple[bool, str | None]: + """Quick Python syntax probe via :func:`ast.parse`.""" + try: + ast.parse(code, filename=path) + except SyntaxError as exc: + return False, f"{exc.__class__.__name__}: {exc}" + return True, None + + def list_code_units( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Walk the full AST and return every class / function / method + declaration as :class:`LPCodeUnit`. The result includes nested + declarations because ``func_design`` consumers expect a flat + view of every code unit in the source. + + The raw ``ast`` node is preserved in + ``unit.extra["ast_node"]`` so callers that need fine-grained + AST inspection (e.g. signature formatting, decorator lookup) + can read it without re-parsing. 
+ """ + # Local imports to avoid forcing ``lang_parser`` as a top-level + # dependency when this method is unused. + from lang_parser import LPCodeUnit as _LPCodeUnit # type: ignore + + try: + tree = ast.parse(code, filename=path) + except (SyntaxError, ValueError): + return [] + + units: list[Any] = [] + # Track parent class for ``method`` units. ``ast.walk`` yields + # nodes in BFS order without parent info, so we precompute a + # child→parent map first. + parent_map: dict[ast.AST, ast.AST | None] = {tree: None} + for parent in ast.walk(tree): + for child in ast.iter_child_nodes(parent): + parent_map[child] = parent + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + unit_type = "class" + parent_name = None + elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + parent_node = parent_map.get(node) + if isinstance(parent_node, ast.ClassDef): + unit_type = "method" + parent_name = parent_node.name + else: + unit_type = "function" + parent_name = None + else: + continue + units.append( + _LPCodeUnit( + name=node.name, + unit_type=unit_type, + file_path=path, + parent=parent_name, + line_start=getattr(node, "lineno", None), + line_end=getattr(node, "end_lineno", getattr(node, "lineno", None)), + code=self._source_for_node(code, node), + language=self.name, + extra={"ast_node": node, "node_type": type(node).__name__}, + ) + ) + return units + + def format_signature(self, unit: Any) -> str: + """Render a function / method ``LPCodeUnit`` into a one-line + signature. Falls back to ``unit.name`` for non-function units + or when the AST node is unavailable. + + Truncates parameter lists longer than four entries (``..., ...``) + and renders the return annotation when present. 
+ """ + if unit is None: + return "" + name = getattr(unit, "name", None) or "" + if getattr(unit, "unit_type", None) not in ("function", "method"): + return name + node = (getattr(unit, "extra", {}) or {}).get("ast_node") + if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + return name + + params: list[str] = [] + for arg in node.args.args: + if arg.arg == "self": + continue + param_str = arg.arg + if arg.annotation is not None: + # ``ast.unparse`` is 3.9+; we already require 3.10+ for + # the decoder but keep the guard for parity with the + # original helper. + type_str = ast.unparse(arg.annotation) if hasattr(ast, "unparse") else "" + if type_str: + param_str = f"{arg.arg}: {type_str}" + params.append(param_str) + + ret_str = "" + if node.returns is not None: + ret_type = ast.unparse(node.returns) if hasattr(ast, "unparse") else "" + if ret_type: + ret_str = f" -> {ret_type}" + + if len(params) > 4: + params_str = ", ".join(params[:3]) + ", ..." + else: + params_str = ", ".join(params) + return f"{name}({params_str}){ret_str}" + + def list_imports( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Extract import statements as :class:`LPDependency` records. + + Mirrors the dependency shape produced by + :class:`lang_parser.python_parser.PythonParser._dependencies_from_import` + so call sites can swap between the two implementations + without re-keying the dict literals. 
+ """ + from lang_parser import LPDependency as _LPDependency # type: ignore + + try: + tree = ast.parse(code, filename=path) + except (SyntaxError, ValueError): + return [] + + deps: list[Any] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + deps.append(_LPDependency( + src=path, + dst=alias.name, + relation="imports", + symbol=alias.asname or alias.name, + line=getattr(node, "lineno", None), + confidence="unresolved", + extra={"module": alias.name, "alias": alias.asname}, + )) + elif isinstance(node, ast.ImportFrom): + module = "." * (node.level or 0) + (node.module or "") + for alias in node.names: + deps.append(_LPDependency( + src=path, + dst=module or None, + relation="imports", + symbol=alias.asname or alias.name, + line=getattr(node, "lineno", None), + confidence="unresolved", + extra={ + "module": module, + "imported": alias.name, + "alias": alias.asname, + }, + )) + return deps + + def list_inheritance( + self, + code: str, + path: str = "", + ) -> list[Any]: + """Extract inheritance edges (``class Child(Base)``) as + :class:`LPDependency` records with ``relation="inherits"``. + + ``src`` is the child class name and ``symbol``/``dst`` the base + name, mirroring the ``inherits`` shape emitted by the + tree-sitter backends so consumers treat every language alike. 
+ """ + from lang_parser import LPDependency as _LPDependency # type: ignore + + try: + tree = ast.parse(code, filename=path) + except (SyntaxError, ValueError): + return [] + + def _base_name(node: ast.expr) -> str | None: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return node.attr + return None + + deps: list[Any] = [] + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef): + continue + for base in node.bases or []: + parent = _base_name(base) + if not parent: + continue + deps.append(_LPDependency( + src=node.name, + dst=parent, + relation="inherits", + symbol=parent, + line=getattr(node, "lineno", None), + confidence="high", + extra={"language": self.name, "child": node.name, "parent": parent}, + )) + return deps + + def unit_kind(self, unit_name: str) -> str: + return classify_unit_kind(unit_name) + + def is_callable_unit(self, unit_name: str) -> bool: + return classify_unit_kind(unit_name) == "callable" + + def entry_point_path(self, module: str) -> str: + return "main.py" + + def find_existing_entry(self, interfaces: dict) -> str | None: + from .backend import default_find_existing_entry + + return default_find_existing_entry(self, interfaces) + + def entry_point_candidates(self) -> list[str]: + return [self.entry_point_path("")] + + def prepare_test_env(self, env) -> None: + return None + + def entry_run_command(self, repo_root: Path, entry: str) -> list[str] | None: + if not (repo_root / entry).is_file(): + return None + return ["python", entry, "--help"] + + def find_main_block_lineno(self, code: str) -> int | None: + """Return the 1-based line number of the top-level + ``if __name__ == "__main__":`` block, or ``None`` when no such + block exists or the source has a syntax error. + + Python-only helper (not part of the :class:`LanguageBackend` + Protocol). 
Other languages don't have this concept; callers + feature-detect via ``getattr(backend, 'find_main_block_lineno', + None)`` and skip the splice when absent. + """ + try: + tree = ast.parse(code) + except (SyntaxError, ValueError): + return None + for node in tree.body: + if ( + isinstance(node, ast.If) + and isinstance(node.test, ast.Compare) + and isinstance(node.test.left, ast.Name) + and node.test.left.id == "__name__" + ): + return getattr(node, "lineno", None) + return None + + @staticmethod + def _source_for_node(source: str, node: ast.AST) -> str: + """Slice ``source`` to the lines covered by ``node`` (matches + :meth:`lang_parser.python_parser.PythonParser._source_for_node`).""" + line_start = getattr(node, "lineno", None) + line_end = getattr(node, "end_lineno", line_start) + if line_start is not None and line_end is not None: + lines = source.splitlines() + if 1 <= line_start <= line_end <= len(lines): + return "\n".join(lines[line_start - 1:line_end]) + try: + return ast.unparse(node).strip() + except Exception: + return "" + + # ------------------------------------------------------------------ + # 3. 
Build / test environment — not wired into the decoder yet + # ------------------------------------------------------------------ + + def detect_env(self, repo_root: Path) -> EnvHandle | None: + """Return an existing Python test environment when supported.""" + raise NotImplementedError( + "PythonBackend.detect_env is not wired into the decoder; " + "callers should keep using code_gen.test_runner.get_dev_python " + "for now.", + ) + + def ensure_env(self, repo_root: Path) -> EnvHandle: + """Always available on a host that's already running Python + (the decoder itself), so this never raises + :class:`ToolchainUnavailable` once implemented.""" + raise NotImplementedError( + "PythonBackend.ensure_env is not wired into the decoder.", + ) + + def test_command( + self, + env: EnvHandle, + selectors: list[str] | None = None, + ) -> list[str]: + """Return the command used to run Python tests when supported.""" + raise NotImplementedError( + "PythonBackend.test_command is not wired into the decoder; " + "callers should keep using code_gen.batch_prompts." + "build_batch_pytest_cmd for now.", + ) + + def install_deps_command( + self, + env: EnvHandle, + deps: list[str], + ) -> list[str] | None: + """Return a dependency-install command when supported.""" + raise NotImplementedError( + "PythonBackend.install_deps_command is not wired into the decoder.", + ) + + # ------------------------------------------------------------------ + # 4. Test-output parsing — not wired into the decoder yet + # ------------------------------------------------------------------ + + def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult: + """Parse native Python test output when backend-driven tests run.""" + raise NotImplementedError( + "PythonBackend.parse_test_output is not wired into the decoder; " + "callers should keep using code_gen.test_output_parser." + "analyze_test_output for now.", + ) + + # ------------------------------------------------------------------ + # 5. 
Prompt hints + # ------------------------------------------------------------------ + + _PROMPT_HINTS_SINGLETON: PromptHints | None = None + + def prompt_hints(self) -> PromptHints: + """Return cached :class:`PromptHints` for Python. Built lazily + on first call so the dataclass instance is reused across all + prompt renders (callers may use it as a dict key safely).""" + cached = PythonBackend._PROMPT_HINTS_SINGLETON + if cached is not None: + return cached + hints = PromptHints( + display_name=self.display_name, + markdown_fence=self.markdown_fence, + file_extension=self.file_extension, + module_naming_rule=( + "Use snake_case file and directory names; every package " + "directory must contain an __init__.py." + ), + package_layout_example=( + "src/\n" + " myproject/\n" + " __init__.py\n" + " cli.py\n" + " core.py\n" + "tests/\n" + " test_cli.py\n" + ), + entrypoint_example="src//main.py", + test_framework_name="pytest", + style_directive=( + "Write idiomatic Python 3.10+ code. Prefer dataclasses " + "over plain dicts for structured records; use type " + "annotations on every public function." + ), + ) + PythonBackend._PROMPT_HINTS_SINGLETON = hints + return hints + + def project_task_templates( + self, + context: ProjectTaskContext, + ) -> ProjectTaskTemplates | None: + """Return None so plan_tasks can use its Python fallback.""" + return None + + +# Re-export for callers that want the exception without pulling in +# the whole backend module. 
+__all__ = ["PythonBackend", "ToolchainUnavailable"] diff --git a/CoderMind/scripts/decoder_lang/rust_backend.py b/CoderMind/scripts/decoder_lang/rust_backend.py new file mode 100644 index 0000000..7634463 --- /dev/null +++ b/CoderMind/scripts/decoder_lang/rust_backend.py @@ -0,0 +1,352 @@ +"""Production :class:`LanguageBackend` implementation for Rust.""" +from __future__ import annotations + +import re +import shutil +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestFailure, TestRunResult, ran_no_tests + +_RUST_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") +_RUST_IDENT_INVALID = re.compile(r"[^A-Za-z0-9_]") +_PLACEHOLDER_RE = re.compile( + r"(?is)\b(?:todo!|unimplemented!|panic!\s*\(|TODO|PLACEHOLDER|NOT IMPLEMENTED)" +) +_RUST_KEYWORDS = frozenset({ + "as", "async", "await", "break", "const", "continue", "crate", + "dyn", "else", "enum", "extern", "false", "fn", "for", "if", + "impl", "in", "let", "loop", "match", "mod", "move", "mut", + "pub", "ref", "return", "self", "Self", "static", "struct", + "super", "trait", "true", "type", "unsafe", "use", "where", "while", +}) + + +class RustBackend: + """:class:`LanguageBackend` for Rust source.""" + + name = "rust" + display_name = "Rust" + file_extension = ".rs" + markdown_fence = "rust" + + def is_source_file(self, path: str) -> bool: + return path.endswith(".rs") + + def is_test_file(self, path: str) -> bool: + normalised = path.replace("\\", "/") + basename = normalised.rsplit("/", 1)[-1] + return "/tests/" in f"/{normalised}" or basename.endswith("_test.rs") + + def package_marker_filename(self) -> str | None: + return None + + def package_marker_content(self, pkg_path: str) -> str | None: + return None + + def is_valid_module_identifier(self, segment: str) -> bool: + if not 
segment or segment in _RUST_KEYWORDS: + return False + return bool(_RUST_IDENT_RE.match(segment)) + + def sanitize_module_identifier(self, segment: str) -> str: + if not segment: + return "_" + cleaned = _RUST_IDENT_INVALID.sub("_", segment) + if cleaned[:1].isdigit(): + cleaned = f"_{cleaned}" + if cleaned in _RUST_KEYWORDS: + cleaned = f"{cleaned}_" + return cleaned + + def has_placeholder(self, code: str, path: str = "") -> bool: + ok, _ = self.syntax_check(code, path) + return ok and bool(_PLACEHOLDER_RE.search(code)) + + def syntax_check(self, code: str, path: str = "") -> tuple[bool, str | None]: + return self._parser().validate_syntax(self._parse_path(path), code) + + def list_code_units(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [ + unit for unit in result.units + if unit.unit_type in {"struct", "enum", "trait", "function", "method"} + ] + + def format_signature(self, unit: Any) -> str: + if unit is None: + return "" + code = (getattr(unit, "code", "") or "").strip() + if not code: + return getattr(unit, "name", "") or "" + first = code.split("{", 1)[0].split(";", 1)[0].strip() + return " ".join(first.split()) or (getattr(unit, "name", "") or "") + + def list_imports(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "imports"] + + def list_inheritance(self, code: str, path: str = "") -> list[Any]: + result = self._parse(code, path) + if result is None or result.syntax_error: + return [] + return [dep for dep in result.dependencies if dep.relation == "inherits"] + + def unit_kind(self, unit_name: str) -> str: + return classify_unit_kind(unit_name) + + def is_callable_unit(self, unit_name: str) -> bool: + return classify_unit_kind(unit_name) == "callable" + + def entry_point_path(self, module: str) -> str: + 
def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult:
    """Summarise raw ``cargo test`` output as a :class:`TestRunResult`.

    The passed / failed / ignored counters of every ``test result:`` line
    are added together. A zero-exit run that ``ran_no_tests`` judges to be
    a no-op is reported as ``"errored"`` so it cannot pass a gate.
    """
    # cargo prints "test result: ok. N passed; M failed; K ignored" once per
    # test binary, so the per-binary counters have to be accumulated.
    summaries = re.findall(
        r"test result:\s*\w+\.\s*(\d+)\s+passed;\s*(\d+)\s+failed"
        r"(?:;\s*(\d+)\s+ignored)?",
        raw,
    )
    passed = failed = ignored = 0
    for passed_text, failed_text, ignored_text in summaries:
        passed += int(passed_text)
        failed += int(failed_text)
        ignored += int(ignored_text or 0)
    # No summary line at all means the count is unknown, not zero.
    observed = passed + failed + ignored if summaries else None

    if ran_no_tests(exit_code, raw, observed_tests=observed):
        status = "errored"
    elif exit_code == 0:
        status = "passed"
    else:
        status = "failed"

    failures = []
    reported_failed = failed
    if status == "failed":
        failures.append(TestFailure(
            test_id="cargo test",
            short_message="cargo test failed",
            long_message=raw,
        ))
        # A failing exit without a parsed failure count still counts as one failure.
        reported_failed = failed or 1

    return TestRunResult(
        status=status,
        exit_code=exit_code,
        passed_count=passed,
        failed_count=reported_failed,
        error_count=0,
        skipped_count=ignored,
        duration_sec=0.0,
        failures=failures,
        raw_output=raw,
        extra={"tool": "cargo test"},
    )
def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates:
    """Build the Rust-specific prompts for the project-level tasks.

    Returns a :class:`ProjectTaskTemplates` holding three prompt strings
    (``dependencies``, ``main_entry``, ``readme``) filled in from
    ``context.repo_name``, ``context.repo_info``, ``context.package_name``
    and the entry-point path.

    Every reference to the binary entry file in the ``main_entry`` prompt
    uses the same reconciled path, so the prompt never asks for ``main()``
    in a file other than the one it tells the model to create or extend.
    """
    # Reuse the planner-reconciled entry path when provided (avoids a
    # second ``main`` file); else fall back to the canonical path.
    entry = context.entry_point_path or "src/main.rs"
    return ProjectTaskTemplates(
        dependencies=f"""Generate or update Rust Cargo dependency files for the repository: {context.repo_name}

**Files to create/update:**
1. `Cargo.toml` - Cargo package declaration using package name `{context.package_name}`
2. `Cargo.lock` - Only if dependency resolution creates it

**Instructions:**
1. Prefer the Rust standard library for CLI parsing and file handling unless implemented code already requires a crate.
2. Include `serde` and `serde_json` when JSON serialization is implemented.
3. Use edition `2021` unless the implemented code requires another stable edition.
4. Run `cargo test` after updating dependencies.

**Important:**
- Do NOT create Python dependency files for a Rust project.
- Keep dependency choices minimal and justified by actual imports.
""",
        main_entry=f"""Create the Rust command entry point for the repository: {context.repo_name}
Repository purpose: {context.repo_info}

**Goal:** Create a production-quality Cargo CLI entry point that lets users run the complete product through documented commands.

**Files to create:**
1. `{entry}` - Binary entry point for the CLI.
2. `src/lib.rs` (optional) - Library module that exposes reusable task/store logic.

**Critical Rules:**
- Do NOT re-implement business logic in the entry file. Delegate to modules already defined in the crate.
- This is the ONLY binary entry point in the repository. If `{entry}` already exists, extend it in place — do NOT create a second entry file.
- Every `use` path must reference real modules and symbols.
- Use idiomatic `Result`-based error handling and explicit non-zero exits for user-facing failures.
- Keep output plain text unless the requirements explicitly ask otherwise.

**Requirements:**
1. Provide a `main()` function in `{entry}`.
2. Expose all major CLI commands and options described in `docs/`.
3. Delegate storage and task lifecycle behavior to implemented modules.
4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly.
5. Verify with `cargo run -- --help` and `cargo test`.

**Important:**
- Read `docs/` first and faithfully expose the requested behavior.
- Do NOT create Python package entry points for this Rust project.
""",
        readme=f"""Update the README.md for the repository: {context.repo_name}
Repository purpose: {context.repo_info}

**Goal:** Replace the placeholder README with comprehensive documentation for the actual Rust CLI implementation.

**Sections to include:**

## 1. Project Title & Description
- Clear, concise description of what the CLI does
- Key commands and capabilities

## 2. Installation
- Rust/Cargo prerequisite
- Clone/build instructions
- Dependency setup with `cargo build`

## 3. Usage
- How to run the CLI with `cargo run -- --help`
- Common command examples with expected plain-text output
- Data file options and local persistence behavior if applicable

## 4. Project Structure
- Brief overview of `src/`, modules, and tests
- Key modules and their purposes

## 5. Development
- How to run tests with `cargo test`
- How to format code with `cargo fmt`

**Instructions:**
1. Read the `docs/` directory for the original requirements.
2. Explore the actual Rust codebase to understand what was implemented.
3. Run `cargo run -- --help` if the binary exists.
4. Reference actual module names, structs, traits, enums, and functions.

**Important:**
- Do NOT document Python commands, Python test runners, or Python dependency files for this Rust project.
- Base everything on the actual implemented code, not assumptions.
- Keep the tone professional and concise.
""",
    )
def ran_no_tests(
    exit_code: int,
    raw_output: str,
    *,
    observed_tests: int | None = None,
    no_tests_markers: Sequence[str] = (),
    empty_output_is_no_op: bool = True,
) -> bool:
    """Return True when a test command exited 0 but executed no tests.

    A zero-test run is a no-op, not a pass. The check is fail-safe toward
    "tests ran" so a real passing run is never mis-flagged. Evidence of
    *no* tests, in order:

    * empty / whitespace-only output, unless ``empty_output_is_no_op`` is
      False (for backends whose test command may legitimately print
      nothing on success);
    * a tool-specific "no tests" marker phrase found case-insensitively
      in the output;
    * a reliably parsed ``observed_tests == 0``.

    Args:
        exit_code: Exit status of the test command. Any non-zero value
            returns False, because the caller already reports it as a
            failure.
        raw_output: Combined output of the command; ``None`` or empty is
            treated as empty output.
        observed_tests: Parsed number of executed tests, or ``None`` when
            no trustworthy count is available.
        no_tests_markers: Phrases that signal "no tests". A single string
            is accepted as one marker; blank markers are ignored.
        empty_output_is_no_op: Whether empty output counts as a no-op.

    Returns:
        True only when the run exited 0 and shows evidence of zero tests.
    """
    if exit_code != 0:
        return False
    text = (raw_output or "").strip()
    if not text:
        return empty_output_is_no_op
    # A bare string would otherwise be iterated character by character and
    # match almost any output; treat it as the single marker it is.
    if isinstance(no_tests_markers, str):
        no_tests_markers = (no_tests_markers,)
    lowered = text.lower()
    for marker in no_tests_markers:
        # "" (or whitespace) is a substring of nearly every output, so a
        # blank marker must not be allowed to flag a real run as a no-op.
        if marker.strip() and marker.lower() in lowered:
            return True
    return observed_tests == 0
+ """ + + project_root: Path + runtime_executable: str | None = None + extra: dict[str, Any] = field(default_factory=dict) diff --git a/CoderMind/scripts/decoder_lang/typescript_backend.py b/CoderMind/scripts/decoder_lang/typescript_backend.py new file mode 100644 index 0000000..95f3c3d --- /dev/null +++ b/CoderMind/scripts/decoder_lang/typescript_backend.py @@ -0,0 +1,423 @@ +"""Production :class:`LanguageBackend` implementation for TypeScript.""" +from __future__ import annotations + +import json +import re +import shutil +from pathlib import Path +from typing import Any + +from .backend import ToolchainUnavailable +from .prompt_hints import PromptHints +from .project_tasks import ProjectTaskContext, ProjectTaskTemplates +from .unit_kind import classify_unit_kind +from .test_result import EnvHandle, TestFailure, TestRunResult, ran_no_tests + +_TS_SEGMENT_RE = re.compile(r"^[A-Za-z0-9_$-]+$") +_TS_SEGMENT_INVALID = re.compile(r"[^A-Za-z0-9_$-]") +_TS_INTERFACE_RE = re.compile( + r"^\s*(?:export\s+)?(?:declare\s+)?interface\s+([A-Za-z_$][\w$]*)\b" +) +_TS_TYPE_RE = re.compile( + r"^\s*(?:export\s+)?(?:declare\s+)?type\s+([A-Za-z_$][\w$]*)\b" +) +_TS_DECLARE_FUNCTION_RE = re.compile( + r"^\s*(?:export\s+)?declare\s+function\s+([A-Za-z_$][\w$]*)\b" +) +_TS_DECLARE_CLASS_RE = re.compile( + r"^\s*(?:export\s+)?declare\s+(?:abstract\s+)?class\s+([A-Za-z_$][\w$]*)\b" +) +_TS_ENUM_RE = re.compile( + r"^\s*(?:export\s+)?(?:declare\s+)?enum\s+([A-Za-z_$][\w$]*)\b" +) +_PLACEHOLDER_RE = re.compile( + r"(?is)\b(?:TODO|PLACEHOLDER|NOT IMPLEMENTED|throw\s+new\s+Error\s*\()" +) +_TS_LINE_COMMENT_RE = re.compile(r"//.*?$", re.MULTILINE) +_TS_BLOCK_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL) + + +class TypeScriptBackend: + """:class:`LanguageBackend` for TypeScript source.""" + + name = "typescript" + display_name = "TypeScript" + file_extension = ".ts" + markdown_fence = "typescript" + + def is_source_file(self, path: str) -> bool: + return (path.endswith(".ts") or 
def is_test_file(self, path: str) -> bool:
    """Tell whether ``path`` names a TypeScript test file.

    A path qualifies when one of its directory components is ``tests``
    or when its file name ends in ``.test.ts``, ``.spec.ts``,
    ``.test.tsx`` or ``.spec.tsx``. Backslashes are treated as path
    separators.
    """
    # The leading "/" lets a top-level "tests/..." match the same pattern
    # as a nested ".../tests/...".
    rooted = "/" + path.replace("\\", "/")
    if "/tests/" in rooted:
        return True
    # None of the suffixes contains "/", so testing the whole path is the
    # same as testing its final component.
    return rooted.endswith((".test.ts", ".spec.ts", ".test.tsx", ".spec.tsx"))
def list_inheritance(self, code: str, path: str = "") -> list[Any]:
    """Return the ``inherits`` dependencies the parser reports for ``code``.

    The source is passed through ``_parse_source`` before parsing. An
    empty list is returned when parsing yields nothing or reports a
    syntax error.
    """
    parsed = self._parse(self._parse_source(code), path)
    if parsed is None:
        return []
    if parsed.syntax_error:
        return []
    inherited: list[Any] = []
    for dependency in parsed.dependencies:
        if dependency.relation == "inherits":
            inherited.append(dependency)
    return inherited
def parse_test_output(self, raw: str, exit_code: int) -> TestRunResult:
    """Summarise raw ``npm test`` / ``node --test`` output.

    The run summary is read from lines of the form ``tests N``,
    ``pass N`` and ``fail N``. Both the TAP comment prefix (``# tests N``)
    and the info-marker prefix used by the ``spec`` reporter
    (``\u2139 tests N``) are recognised, so the zero-test gate applies
    whichever of the two produced the output.

    A zero-exit run that ``ran_no_tests`` judges to be a no-op is reported
    as ``"errored"`` so it cannot pass a gate.
    """

    def summary_count(label: str) -> int | None:
        # Accept "# tests 3" (TAP), "\u2139 tests 3" (spec) and a bare "tests 3".
        found = re.search(rf"(?m)^(?:#|\u2139)?\s*{label}\s+(\d+)\b", raw)
        return int(found.group(1)) if found else None

    # The test count tells a real run from a no-op that exits 0 without
    # running anything; None means the count could not be parsed.
    observed = summary_count("tests")
    if ran_no_tests(exit_code, raw, observed_tests=observed):
        status = "errored"
    elif exit_code == 0:
        status = "passed"
    else:
        status = "failed"

    failures = []
    if status == "failed":
        failures.append(TestFailure(
            test_id="npm test",
            short_message="npm test failed",
            long_message=raw,
        ))

    passed = summary_count("pass")
    failed = summary_count("fail")
    if failed is None:
        # No parsed count: a failing exit still counts as one failure.
        failed = 1 if status == "failed" else 0

    return TestRunResult(
        status=status,
        exit_code=exit_code,
        passed_count=passed or 0,
        failed_count=failed,
        error_count=0,
        skipped_count=0,
        duration_sec=0.0,
        failures=failures,
        raw_output=raw,
        extra={"tool": "npm test"},
    )
+ ), + ) + TypeScriptBackend._PROMPT_HINTS_SINGLETON = hints + return hints + + def project_task_templates(self, context: ProjectTaskContext) -> ProjectTaskTemplates: + # Reuse the planner-reconciled entry path when provided (avoids a + # second entry file); else fall back to the canonical path. + entry = context.entry_point_path or "src/index.ts" + return ProjectTaskTemplates( + dependencies=f"""Generate or update Node.js/TypeScript dependency files for the repository: {context.repo_name} + +**Files to create/update:** +1. `package.json` - Package metadata, scripts, and dependencies using package name `{context.package_name}` +2. `tsconfig.json` - TypeScript compiler configuration for Node.js +3. `package-lock.json` - Only if dependency installation creates it + +**Instructions:** +1. Prefer Node.js standard APIs for local file and CLI behavior. +2. Add TypeScript tooling and a minimal test runner only when needed by the implemented code. +3. Provide scripts for `npm start`, `npm test`, and type checking when appropriate. +4. Run `npm test` after updating dependencies. + +**Important:** +- Do NOT create Python dependency files for a TypeScript project. +- Keep dependencies minimal and aligned with actual imports. +""", + main_entry=f"""Create the TypeScript command entry point for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Create a production-quality Node.js CLI entry point that lets users run the complete product through documented commands. + +**Files to create:** +1. `{entry}` - CLI entry point exported or referenced by package scripts. +2. `src/cli.ts` (optional) - Command parsing and dispatch separated from domain logic. + +**Critical Rules:** +- Do NOT re-implement business logic in the entry file. Import and delegate to implemented modules. +- This is the ONLY program entry point in the repository. If `{entry}` already exists, extend it in place — do NOT create a second entry file. 
+- Every import must reference real files and exported symbols. +- Use explicit error handling and non-zero process exits for user-facing failures. +- Keep output plain text unless the requirements explicitly ask otherwise. + +**Requirements:** +1. Expose all major CLI commands and options described in `docs/`. +2. Wire `package.json` scripts so users can run the CLI with `npm start -- --help`. +3. Delegate storage and task lifecycle behavior to implemented modules. +4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly. +5. Verify with `npm start -- --help` and `npm test`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this TypeScript project. +""", + readme=f"""Update the README.md for the repository: {context.repo_name} +Repository purpose: {context.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual TypeScript CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Installation +- Node.js/npm prerequisite +- Clone/install instructions using `npm install` +- TypeScript build or runtime notes if applicable + +## 3. Usage +- How to run the CLI with `npm start -- --help` +- Common command examples with expected plain-text output +- Data file options and local persistence behavior if applicable + +## 4. Project Structure +- Brief overview of `src/`, `tests/`, and configuration files +- Key modules and their purposes + +## 5. Development +- How to run tests with `npm test` +- How to run type checks and build commands if scripts exist + +**Instructions:** +1. Read the `docs/` directory for the original requirements. +2. Explore the actual TypeScript codebase to understand what was implemented. +3. Run `npm start -- --help` if package scripts exist. +4. 
Reference actual exported types, functions, and modules. + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this TypeScript project. +- Base everything on the actual implemented code, not assumptions. +- Keep the tone professional and concise. +""", + ) + + @staticmethod + def _parser() -> Any: + from lang_parser import get_parser # type: ignore + + return get_parser("typescript") + + @staticmethod + def _parse_path(path: str) -> str: + if path == "" or not (path.endswith(".ts") or path.endswith(".tsx")): + return "src/index.ts" + return path + + @staticmethod + def _parse_source(code: str) -> str: + return _TS_LINE_COMMENT_RE.sub("", _TS_BLOCK_COMMENT_RE.sub("", code)) + + def _parse(self, code: str, path: str): + try: + return self._parser().parse_file(self._parse_path(path), code) + except Exception: + return None + + def _declaration_units(self, code: str, path: str) -> list[Any]: + from lang_parser import LPCodeUnit # type: ignore + + units: list[Any] = [] + lines = code.splitlines() + for index, line in enumerate(lines): + match = ( + _TS_INTERFACE_RE.match(line) + or _TS_TYPE_RE.match(line) + or _TS_DECLARE_FUNCTION_RE.match(line) + or _TS_DECLARE_CLASS_RE.match(line) + or _TS_ENUM_RE.match(line) + ) + if match is None: + continue + unit_type = self._declaration_unit_type(line) + end = self._declaration_end(lines, index) + units.append(LPCodeUnit( + name=match.group(1), + unit_type=unit_type, + file_path=path, + parent=None, + line_start=index + 1, + line_end=end + 1, + code="\n".join(lines[index:end + 1]), + language=self.name, + extra={"kind": unit_type}, + )) + return units + + @staticmethod + def _declaration_unit_type(line: str) -> str: + if "interface" in line: + return "interface" + if "type" in line: + return "type" + if "function" in line: + return "function" + if "class" in line: + return "class" + if "enum" in line: + return "enum" + return "declaration" + + def _declaration_end(self, lines: 
def _default_package_name(self, repo_root: Path) -> str:
    """Derive a package name from the repository directory name.

    The directory name is lower-cased, underscores and spaces become
    hyphens, and the result is passed through
    ``sanitize_module_identifier``.
    """
    hyphenate = str.maketrans({"_": "-", " ": "-"})
    candidate = repo_root.name.lower().translate(hyphenate)
    return self.sanitize_module_identifier(candidate)
# Units that are normally USED BY BEING CALLED. Orphan detection
# ("no incoming edge => dead") is only meaningful for these.
#
# ``class`` is listed here because instantiating a class invokes its
# constructor, so a used class is expected to have an incoming
# invocation edge just like a function.
CALLABLE_UNIT_PREFIXES: frozenset[str] = frozenset({
    "function",
    "func",
    "method",
    "fn",
    "class",
    "constructor",
})

# Units that are TYPES: instantiated, referenced, or used as field /
# parameter types, never "invoked". A type with no incoming invocation
# edge is NOT dead code, so orphan detection must skip it.
TYPE_UNIT_PREFIXES: frozenset[str] = frozenset({
    "struct",
    "enum",
    "interface",
    "trait",
    "type",
    "union",
    "typedef",
    "record",
})


def classify_unit_kind(
    unit_name: str,
    *,
    callable_prefixes: frozenset[str] = CALLABLE_UNIT_PREFIXES,
    type_prefixes: frozenset[str] = TYPE_UNIT_PREFIXES,
) -> str:
    """Classify ``unit_name`` as ``"callable"`` / ``"type"`` / ``"unknown"``.

    ``unit_name`` is expected to carry a leading kind token
    (``"function foo"``, ``"struct Bar"``). The token is the first
    whitespace-delimited word, compared case-insensitively, so leading
    whitespace and tab separators do not hide it.

    Args:
        unit_name: Interface unit name, kind token first.
        callable_prefixes: Kind tokens that denote callable units.
        type_prefixes: Kind tokens that denote type-like units.

    Returns:
        ``"callable"`` or ``"type"`` when the token is in the matching
        set (callable is checked first); ``"unknown"`` when the name is
        empty, blank, or starts with an unrecognised token.
    """
    if not unit_name:
        return "unknown"
    # split(None, 1) skips leading whitespace and splits on any whitespace
    # run; a blank name yields no words at all.
    words = unit_name.split(None, 1)
    if not words:
        return "unknown"
    token = words[0].lower()
    if token in callable_prefixes:
        return "callable"
    if token in type_prefixes:
        return "type"
    return "unknown"
+ class_names = extract_declaration_names(code, backend) for class_name in class_names: # Check if class node with same signature already exists under this file @@ -278,6 +282,9 @@ def build( """ # Get repository info repo_name, repo_info = get_repo_info_from_files() + primary_language, _ = extract_language_metadata(skeleton) + if not primary_language: + primary_language = extract_language_metadata(data_flow)[0] # Get project background / technology context project_background = get_project_background_context() @@ -316,7 +323,8 @@ def build( max_iterations=self.max_iterations, logger=self.logger, trajectory=self.trajectory, - step_id=self._current_step_id + step_id=self._current_step_id, + target_language=primary_language, ) result = agent.design_base_classes( @@ -328,6 +336,9 @@ def build( functional_areas_overview=functional_areas_overview, project_background=project_background, ) + result["meta"] = metadata_with_languages( + skeleton if extract_language_metadata(skeleton)[0] else data_flow + ) # Update trajectory if self.trajectory and self._current_step_id: @@ -351,6 +362,8 @@ def print_summary(self, result: Dict[str, Any]) -> None: print("=" * 60) base_classes = result.get("base_classes", []) + backend = get_backend(extract_language_metadata(result)[0]) + class_names = result.get("class_names", []) data_structures = result.get("data_structures", []) ds_class_names = result.get("data_structure_names", []) @@ -365,7 +378,7 @@ def print_summary(self, result: Dict[str, Any]) -> None: for bc in base_classes: file_path = bc.get("file_path", "")[:40] code = bc.get("code", "") - classes = extract_class_names(code) + classes = extract_declaration_names(code, backend) class_str = ", ".join(classes[:3]) if len(classes) > 3: class_str += f" (+{len(classes) - 3})" @@ -382,7 +395,7 @@ def print_summary(self, result: Dict[str, Any]) -> None: for ds in data_structures: subtree = ds.get("subtree", "")[:30] code = ds.get("code", "") - classes = extract_class_names(code) + 
classes = extract_declaration_names(code, backend) class_str = ", ".join(classes[:3]) if len(classes) > 3: class_str += f" (+{len(classes) - 3})" diff --git a/CoderMind/scripts/design_interfaces.py b/CoderMind/scripts/design_interfaces.py index 2888ca7..aab9c45 100644 --- a/CoderMind/scripts/design_interfaces.py +++ b/CoderMind/scripts/design_interfaces.py @@ -22,7 +22,7 @@ import logging import argparse from pathlib import Path -from typing import Dict, Any, List, Optional +from typing import Callable, Dict, Any, List, Optional # Import trajectory module from common.trajectory import Trajectory, load_or_create_trajectory @@ -34,6 +34,8 @@ # Import Global Interface Reviewer from func_design.interface_review import ( InterfaceReviewer, + check_call_graph_connectivity, + check_feature_dependency_coverage, print_review_summary, ) @@ -53,7 +55,10 @@ ) from common import print_unicode_table, get_repo_info_from_files import ast +import re from common import get_project_background_context +from common.language_meta import extract_language_metadata, metadata_with_languages +from decoder_lang import get_backend from func_design.interface_review import review_orphan_units @@ -102,6 +107,157 @@ def traverse(node): return features +def _collect_skeleton_feature_to_file(skeleton: Dict[str, Any]) -> Dict[str, str]: + """Map each skeleton feature path to the file node that declares it. + + Used by deterministic feature backfill to find which file an + un-attributed feature belongs to (the file whose ``feature_paths`` + list contains it). 
+ """ + mapping: Dict[str, str] = {} + + def traverse(node): + if node.get("type") == "file": + file_path = node.get("path") or node.get("name", "") + for fp in node.get("feature_paths", []): + mapping.setdefault(fp, file_path) + elif node.get("type") == "directory": + for child in node.get("children", []): + traverse(child) + + traverse(skeleton.get("root", skeleton)) + return mapping + + +def _collect_interface_features(interfaces_data: Dict[str, Any]) -> set: + """Return the set of feature paths attributed to some interface unit. + + Mirrors the bench-side consistency check + (``_collect_interface_features`` in ``cmbench/lib/invoker.py``): a + feature is "covered" when it appears in any unit's + ``units_to_features`` list under any subtree. + """ + features: set = set() + subtrees = interfaces_data.get("subtrees", interfaces_data.get("components", {})) + if not isinstance(subtrees, dict): + return features + for subtree_data in subtrees.values(): + if not isinstance(subtree_data, dict): + continue + file_container = subtree_data.get("interfaces", subtree_data.get("files", {})) + if not isinstance(file_container, dict): + continue + for file_data in file_container.values(): + if not isinstance(file_data, dict): + continue + u2f = file_data.get("units_to_features", {}) + if not isinstance(u2f, dict): + continue + for feats in u2f.values(): + for fp in feats or []: + if isinstance(fp, str) and fp.strip(): + features.add(fp) + return features + + +def _select_backfill_unit(units_to_features: Dict[str, Any], feature_path: str) -> Optional[str]: + """Pick the most appropriate unit in a file to receive an orphan feature. + + Deterministic selection (no LLM): the feature is attributed to the + unit whose name tokens overlap most with the feature's leaf segment; + ties (and the zero-overlap case) break toward the unit that already + carries the most features, then lexicographically. Returns ``None`` + when the file has no units to attach to. 
+ """ + if not isinstance(units_to_features, dict) or not units_to_features: + return None + + leaf = feature_path.rsplit("/", 1)[-1].lower() + leaf_tokens = set(leaf.replace("_", " ").replace("-", " ").split()) + + def unit_tokens(unit_name: str) -> set: + bare = unit_name.split(" ", 1)[1] if " " in unit_name else unit_name + # Split camelCase / snake_case into comparable tokens. + spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", bare) + spaced = spaced.replace("_", " ").replace("-", " ").lower() + return set(spaced.split()) + + def score(unit_name: str) -> tuple: + overlap = len(leaf_tokens & unit_tokens(unit_name)) + feature_count = len(units_to_features.get(unit_name) or []) + # Higher overlap first, then more existing features, then stable name. + return (overlap, feature_count, unit_name) + + return max(units_to_features.keys(), key=score) + + +def backfill_uncovered_features( + skeleton: Dict[str, Any], + interfaces_data: Dict[str, Any], +) -> Dict[str, Any]: + """Deterministically attribute skeleton features missing from interfaces. + + The interface designer (LLM) sometimes implements a fine-grained + skeleton feature in code but forgets to list it in any unit's + ``units_to_features``. The bench consistency gate then fails the whole + stage even though the feature is present. This closes that gap WITHOUT + an LLM round-trip: every skeleton feature not attributed to any + interface unit is mapped to its declaring file (via skeleton + ``feature_paths``) and appended to the most appropriate existing unit's + ``units_to_features`` there. + + Only metadata is touched — no units are invented, no code is changed. + Features whose file is absent from interfaces, or whose file has no + units, are reported as ``unbackfilled`` for a downstream WARN. + + Returns an audit dict: ``{"backfilled": [...], "unbackfilled": [...]}``. 
+ """ + skeleton_features = collect_skeleton_features(skeleton) + interface_features = _collect_interface_features(interfaces_data) + uncovered = skeleton_features - interface_features + + audit: Dict[str, Any] = {"backfilled": [], "unbackfilled": []} + if not uncovered: + return audit + + feature_to_file = _collect_skeleton_feature_to_file(skeleton) + subtrees = interfaces_data.get("subtrees", interfaces_data.get("components", {})) + + # Index interface file blocks by file_path for O(1) lookup. + file_blocks: Dict[str, Dict[str, Any]] = {} + if isinstance(subtrees, dict): + for subtree_data in subtrees.values(): + if not isinstance(subtree_data, dict): + continue + container = subtree_data.get("interfaces", subtree_data.get("files", {})) + if isinstance(container, dict): + for fp, fdata in container.items(): + if isinstance(fdata, dict): + file_blocks[fp] = fdata + + for feature in sorted(uncovered): + target_file = feature_to_file.get(feature) + block = file_blocks.get(target_file) if target_file else None + if block is None: + audit["unbackfilled"].append({"feature": feature, "reason": "file not in interfaces"}) + continue + u2f = block.setdefault("units_to_features", {}) + unit = _select_backfill_unit(u2f, feature) + if unit is None: + audit["unbackfilled"].append({"feature": feature, "reason": "file has no units"}) + continue + u2f.setdefault(unit, []) + if feature not in u2f[unit]: + u2f[unit].append(feature) + audit["backfilled"].append({ + "feature": feature, + "file_path": target_file, + "unit": unit, + }) + + return audit + + def collect_rpg_feature_paths(rpg_path: Path) -> set: """Return the set of feature paths present in the current repo_rpg.json. 
@@ -129,6 +285,67 @@ def collect_rpg_feature_paths(rpg_path: Path) -> set: return paths +def _finalize_global_review_verdict( + global_review: dict, + interfaces_data: dict, + enhanced_data_flow: dict, + entry_points: list[dict], + is_callable: Callable[[str], bool], + retained_keys: set[str], + is_test_file: Optional[Callable[[str], bool]] = None, +) -> None: + """Recompute the published convergence verdict from the FINAL graph. + + ``review_and_fix`` records ``orphan_units_count`` / + ``feature_orphans_count`` / ``passed`` *before* the orphan-review + step adds completion edges and prunes units, so those numbers can be + stale — a since-resolved orphan would otherwise surface as a spurious + WARN downstream. This recomputes them from the post-pruning + ``interfaces_data`` + ``enhanced_data_flow`` using the same type-aware + predicate the gate uses, so the structural gate and the published + numbers always agree (the two previously used different graph + builders and could diverge). + + Units the orphan review explicitly RETAINED (``retained_keys``) are + treated as resolved: a reviewer deemed them necessary (e.g. a public + entry the design keeps), so their lack of an incoming edge must not + fail the verdict. + """ + conn = check_call_graph_connectivity( + interfaces_data, enhanced_data_flow, entry_points, + is_callable=is_callable, + is_test_file=is_test_file, + ) + feats = check_feature_dependency_coverage( + interfaces_data, enhanced_data_flow, entry_points, + is_callable=is_callable, + is_test_file=is_test_file, + ) + orphan_keys = [ + u["unit_key"] for u in conn["orphan_units"] + if u["unit_key"] not in retained_keys + ] + feat_orphans = [ + f for f in feats + if f"{f.get('file_path', '')}::{f.get('unit_name', '')}" not in retained_keys + ] + # Advisory ``modify_interface`` requests never gate the verdict; only + # genuinely-unapplied wiring (``blocking_unapplied_fixes_count``) does. 
+ blocking_unapplied = global_review.get( + "blocking_unapplied_fixes_count", + global_review.get("unapplied_fixes_count", 0), + ) + global_review["orphan_units_count"] = len(orphan_keys) + global_review["feature_orphans_count"] = len(feat_orphans) + global_review["unresolved_orphan_units"] = orphan_keys + global_review["unresolved_orphan_features"] = feat_orphans + global_review["passed"] = ( + len(orphan_keys) == 0 + and len(feat_orphans) == 0 + and blocking_unapplied == 0 + ) + + def extract_known_classes_and_types(base_classes: Dict[str, Any]) -> tuple: """Extract known base class names and type names from base_classes.json. @@ -758,6 +975,14 @@ def build( # Get base classes list base_classes_list = base_classes.get("base_classes", []) data_structures_list = base_classes.get("data_structures", []) + primary_language, _ = extract_language_metadata(skeleton) + if not primary_language: + primary_language = extract_language_metadata(data_flow)[0] + if not primary_language: + primary_language = extract_language_metadata(base_classes)[0] + metadata_source = skeleton + if not extract_language_metadata(metadata_source)[0]: + metadata_source = data_flow if extract_language_metadata(data_flow)[0] else base_classes # Extract known classes and types for dependency analysis known_base_classes, known_types = extract_known_classes_and_types(base_classes) @@ -765,7 +990,8 @@ def build( # Initialize dependency collector dependency_collector = DependencyCollector( known_base_classes=known_base_classes, - known_types=known_types + known_types=known_types, + target_language=primary_language, ) # Store original data flow edges @@ -778,7 +1004,8 @@ def build( logger=self.logger, trajectory=self.trajectory, step_id=self._current_step_id, - output_path=self.output_path + output_path=self.output_path, + target_language=primary_language, ) result = orchestrator.design_all_interfaces( @@ -789,9 +1016,10 @@ def build( dependency_collector=dependency_collector, 
data_structures=data_structures_list ) + result["meta"] = metadata_with_languages(metadata_source) # ===================================================================== - # Phase 1.5: Post-process invocation edges (normalise + resolve) + # Post-process invocation edges (normalise + resolve) # ===================================================================== global_registry = result.get("_global_registry") if global_registry: @@ -808,12 +1036,21 @@ def build( self.logger.info(f"Collected dependencies: {dep_summary}") # ===================================================================== - # Phase 2: Global Interface Review (entry points + wiring + auto-fix) + # Global Interface Review (entry points + wiring + auto-fix) # ===================================================================== global_registry = result.pop("_global_registry", None) import_warnings = result.pop("_import_warnings", []) - if global_registry and result.get("success"): + review_language = primary_language or "python" + # The global interface review runs for every language. Its structural + # checks (call-graph connectivity, feature coverage), dependency-edge + # fixes (add_dependency), and orphan pruning are language-agnostic. + # The only Python-specific capability — automatic interface-stub + # synthesis (add_interface) — is skipped inside the reviewer for + # non-Python backends, so non-Python projects still get full + # diagnostics and dependency wiring without invalid stub injection. 
+ review_enabled = bool(global_registry) and result.get("success") + if review_enabled: self.logger.info("Starting global interface review phase...") print("\n" + "=" * 70) print("GLOBAL INTERFACE REVIEW") @@ -822,6 +1059,7 @@ def build( reviewer = InterfaceReviewer( trajectory=self.trajectory, step_id=self._current_step_id, + target_language=review_language, ) review_result = reviewer.review_and_fix( @@ -831,7 +1069,11 @@ def build( import_warnings=import_warnings, data_flow_edges=data_flow.get("data_flow", []), dependency_collector=dependency_collector, - max_fix_iterations=2, + # Three review-fix cycles give coverage gaps (LLM under-listing + # a unit's ``features``) an extra chance to converge before the + # gate reports a feature-orphan WARN; two was often too few for + # web projects with many fine-grained presentation features. + max_fix_iterations=3, skeleton_features=collect_skeleton_features(skeleton), rpg_features=collect_rpg_feature_paths(REPO_RPG_FILE), ) @@ -845,6 +1087,8 @@ def build( "feature_orphans_count": len(review_result.get("final_feature_orphans", [])), "orphan_units_count": len(review_result.get("final_orphan_units", [])), "unapplied_fixes_count": len(review_result.get("unapplied_fixes", [])), + "advisory_fixes_count": len(review_result.get("advisory_fixes", [])), + "blocking_unapplied_fixes_count": len(review_result.get("blocking_unapplied_fixes", [])), "unapplied_fixes": review_result.get("unapplied_fixes", []), "iterations_run": review_result.get("iterations_run", 0), "passed": review_result.get("passed", False), @@ -858,7 +1102,7 @@ def build( print_review_summary(review_result) # ================================================================= - # Phase 3: Create InterfacesStore and prune orphans + # Create InterfacesStore and prune orphans # ================================================================= # Create unified store from current result store = InterfacesStore.from_legacy_format( @@ -870,11 +1114,12 @@ def build( ) # 
================================================================= - # Phase 3b: Review and prune orphan units + # Review and prune orphan units # ================================================================= # First, find orphan units orphan_keys = store.find_orphan_units() - prune_summary = None # Initialize to None + orphan_review_result = None + prune_summary = None if orphan_keys: print(f"\nFound {len(orphan_keys)} orphan interface units (no call edges)") @@ -933,7 +1178,7 @@ def build( result["global_review"]["retained_orphans_count"] = len(orphan_review_result.keys_to_retain) # ================================================================= - # Phase 4: Update result from store and update RPG + # Update result from store and update RPG # ================================================================= # Update result with store's current state (reflects pruning) store_export = store.to_interfaces_json() @@ -954,12 +1199,57 @@ def build( rpg_summary.pruned_feature_nodes + rpg_summary.pruned_parent_nodes ) + # Deterministically attribute any skeleton feature the designer + # implemented but forgot to list in a unit's units_to_features. + # Runs before the verdict recompute so coverage reflects the + # backfill. Metadata-only: no units invented, no code touched. + backfill_audit = backfill_uncovered_features(skeleton, result) + if backfill_audit["backfilled"] or backfill_audit["unbackfilled"]: + result["global_review"]["backfilled_features"] = backfill_audit["backfilled"] + result["global_review"]["unbackfilled_features"] = backfill_audit["unbackfilled"] + self.logger.info( + "Feature backfill: %d attributed, %d unbackfillable", + len(backfill_audit["backfilled"]), + len(backfill_audit["unbackfilled"]), + ) + + # Refresh the published verdict from the FINAL graph (after + # orphan-review completion edges + pruning), honouring units + # the review explicitly retained. Runs unconditionally so the + # numbers never lag review_and_fix's pre-edge snapshot. 
+ _retained_keys = ( + set(orphan_review_result.keys_to_retain) + if orphan_review_result is not None else set() + ) + _finalize_global_review_verdict( + global_review=result["global_review"], + interfaces_data=result, + enhanced_data_flow=result["enhanced_data_flow"], + entry_points=review_result.get("final_entry_points", []), + is_callable=get_backend(review_language).is_callable_unit, + retained_keys=_retained_keys, + is_test_file=get_backend(review_language).is_test_file, + ) + # Update dependency summary dep_summary = store.get_stats() self.logger.info(f"Final store stats: {dep_summary}") else: if not global_registry: self.logger.info("GlobalInterfaceRegistry not available, skipping global review") + elif result.get("success") and review_language != "python": + self.logger.info( + "Skipping global interface review for target language: %s", + review_language, + ) + result["global_review"] = { + "passed": True, + "skipped": True, + "reason": ( + "Global interface review currently supports Python " + "interface repair only." + ), + } # Update trajectory if self.trajectory and self._current_step_id: @@ -1089,12 +1379,29 @@ def print_summary(self, result: Dict[str, Any]) -> None: # Stage 2: Coverage # ------------------------------------------------------------------ print("\n[Stage 2] Coverage — is every skeleton feature mapped to some unit?") - # NOTE: We can compute the skeleton-feature universe from - # interfaces.json alone (every unit declares its features); for a - # canonical count we'd need skeleton.json again. The cross-validate - # path already runs `check_interfaces.py` for that; here we just - # report what the produced data carries. 
- print(f" Distinct features mapped to a unit: {len(all_unit_features)}") + coverage = result.get("coverage", {}) or {} + if coverage: + expected_features = coverage.get("expected_features", 0) + covered_features = coverage.get("covered_features", 0) + expected_files = coverage.get("expected_files", 0) + successful_files = coverage.get("successful_files", 0) + print(f" Files fully covered: {successful_files}/{expected_files}") + print(f" Features covered: {covered_features}/{expected_features}") + issues = coverage.get("issues", []) or [] + if issues: + print(f" [WARNING] {len(issues)} coverage issue(s):") + for issue in issues[:10]: + file_path = issue.get("file_path") or "(subtree)" + missing = issue.get("missing_features", []) or [] + print(f" - {file_path}: {issue.get('reason', 'incomplete')}") + for feature in missing[:3]: + print(f" * {feature}") + if len(missing) > 3: + print(f" ... and {len(missing) - 3} more") + if len(issues) > 10: + print(f" ... and {len(issues) - 10} more") + else: + print(f" Distinct features mapped to a unit: {len(all_unit_features)}") if not all_unit_features: print(" [WARNING] no feature mappings at all — likely Stage 1 failed") @@ -1110,9 +1417,16 @@ def print_summary(self, result: Dict[str, Any]) -> None: orphan_units = global_review.get("orphan_units_count", 0) orphan_features = global_review.get("feature_orphans_count", 0) unapplied_count = global_review.get("unapplied_fixes_count", 0) + advisory_count = global_review.get("advisory_fixes_count", 0) + blocking_count = global_review.get( + "blocking_unapplied_fixes_count", unapplied_count + ) print(f" Orphan units (no incoming edges): {orphan_units}") print(f" Orphan features (no unit reachable from entry): {orphan_features}") - print(f" Unapplied fix requests: {unapplied_count}") + print( + f" Unapplied fix requests: {unapplied_count} " + f"({blocking_count} blocking, {advisory_count} advisory)" + ) for u in global_review.get("unapplied_fixes", [])[:3]: print(f" - 
[{u.get('action','?')}] " f"{u.get('file_path','?')}::{u.get('unit_name','?')}" @@ -1158,8 +1472,14 @@ def print_summary(self, result: Dict[str, Any]) -> None: # ------------------------------------------------------------------ # Verdict — mirrors Stage 3's `passed`. # ------------------------------------------------------------------ - passed = bool(global_review.get("passed")) - verdict = "✓ PASS" if passed else "✗ FAIL — see Stage 3 above" + generation_passed = result.get("success", True) + passed = generation_passed and bool(global_review.get("passed")) + if passed: + verdict = "✓ PASS" + elif not generation_passed: + verdict = "✗ FAIL — interface coverage incomplete" + else: + verdict = "✗ FAIL — see Stage 3 above" print(f"\nOverall: {verdict}") print("(Stage 3 is the strictest; PASS requires Stages 1+2 also clean.)") print("=" * 60) @@ -1308,9 +1628,10 @@ def main(): # RPG update is now handled inside InterfaceDesigner.build() via InterfacesStore - if not result.get("success", True) and "error" in result: + if not result.get("success", True): + error = result.get("error", "Interface coverage incomplete") if trajectory: - trajectory.fail(result["error"]) + trajectory.fail(error) return 1 # Mark trajectory as complete diff --git a/CoderMind/scripts/feature/prompts/__init__.py b/CoderMind/scripts/feature/prompts/__init__.py index 1e027fb..b025164 100644 --- a/CoderMind/scripts/feature/prompts/__init__.py +++ b/CoderMind/scripts/feature/prompts/__init__.py @@ -1,10 +1,9 @@ """LLM prompt templates for feature-related stages. -This package collects prompt templates grouped by stage. The historical -single-file module ``feature/prompts.py`` is preserved as -``feature.prompts.legacy`` and re-exported here so existing imports -(``from feature.prompts import PROMPT_TEMPLATE_BUILD_FEATURE`` etc.) -continue to work unchanged. +This package collects prompt templates grouped by stage. 
Compatibility +prompts live in ``feature.prompts.legacy`` and are re-exported here so +existing imports (``from feature.prompts import PROMPT_TEMPLATE_BUILD_FEATURE`` +etc.) continue to work unchanged. New stages add their prompts in dedicated submodules and re-export from here as needed. diff --git a/CoderMind/scripts/feature/prompts/spec.py b/CoderMind/scripts/feature/prompts/spec.py index 4b65a6c..c8cc962 100644 --- a/CoderMind/scripts/feature/prompts/spec.py +++ b/CoderMind/scripts/feature/prompts/spec.py @@ -1,14 +1,9 @@ """LLM prompt templates for the ``feature_spec`` stage. -The Phase-1 ``feature_spec`` stage converts raw requirements (either a free- -form user description or a set of ``docs/*.md`` files) into a single, -strictly-validated ``feature_spec.json``. - -These prompts replace the historical 1008-line -``templates/commands/feature_spec.md`` slash-command document. The -intermediate Markdown artefacts (``evidence/*.md``, ``feature_spec.md``, -``features/FT-*.md``) are *no longer generated*; the LLM emits the final -JSON directly, validated against ``feature.schemas.spec.FeatureSpecOutput``. +The ``feature_spec`` stage converts raw requirements (either a free-form +user description or a set of ``docs/*.md`` files) into a single, +strictly-validated ``feature_spec.json``. The LLM emits the final JSON +directly, validated against ``feature.schemas.spec.FeatureSpecOutput``. Schema knowledge — field meanings, ID conventions, MIU principle, etc. — lives both here (in the prompt body) and in the Pydantic ``Field`` @@ -48,6 +43,8 @@ class Meta(BaseModel): project_notes: str # ≤500 chars generated_at: str # "YYYY-MM-DD" source_documents: list[str] # ["doc1.md", ...] 
or ["user_input"] + primary_language: str | None = None + target_languages: list[str] = [] class BackgroundItem / NfrItem(BaseModel): id: str # "BG-NNN" / "NFR-NNN" (1-based, zero-padded) @@ -96,6 +93,11 @@ class FeatureSpecOutput(BaseModel): - ``repository_name``: concise, kebab-case, 1-3 words (e.g. ``todo-list-app``). - ``repository_purpose``: 1-2 sentences capturing the core objective. +- ``meta.primary_language``: primary implementation language in lowercase + (e.g. ``python``, ``go``, ``typescript``, ``rust``, ``c``, ``cpp``). +- ``meta.target_languages``: all implementation languages in priority order; + include ``meta.primary_language`` as the first item. For single-language + projects this is a one-item list. ### Background & NFR diff --git a/CoderMind/scripts/feature/schemas/spec.py b/CoderMind/scripts/feature/schemas/spec.py index b5773b5..1bd9ac3 100644 --- a/CoderMind/scripts/feature/schemas/spec.py +++ b/CoderMind/scripts/feature/schemas/spec.py @@ -1,8 +1,7 @@ """Pydantic schemas for ``feature_spec.json``. -The schema mirrors the historical ``feature_spec.json`` shape produced by -``feature_spec_to_json.py`` so that downstream stages (``feature_build``, -``build_skeleton``, …) can consume it without modification. +The schema preserves the ``feature_spec.json`` contract consumed by +downstream stages (``feature_build``, ``build_skeleton``, …). Reference sample:: @@ -32,7 +31,9 @@ from typing import List, Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator + +from common.language_meta import normalize_language_metadata ProjectType = Literal[ @@ -110,6 +111,31 @@ class Meta(BaseModel): "``['01_charter.md','02_spec.md']`` or ``['user_input']``." ), ) + primary_language: str | None = Field( + default=None, + description=( + "Primary target programming language for code generation, e.g. " + "``\"python\"`` / ``\"go\"`` / ``\"typescript\"``." 
+ ), + ) + target_languages: List[str] = Field( + default_factory=list, + description=( + "All programming languages expected in the generated repository. " + "The primary language is listed first." + ), + ) + + @model_validator(mode="after") + def _normalize_language_metadata(self) -> "Meta": + """Keeps primary and list language metadata consistent.""" + primary, languages = normalize_language_metadata( + self.primary_language, + self.target_languages, + ) + self.primary_language = primary + self.target_languages = languages + return self class BackgroundItem(BaseModel): @@ -217,8 +243,8 @@ class FeatureNode(BaseModel): class FeatureSpecOutput(BaseModel): """Top-level model representing the full ``feature_spec.json``. - Field order intentionally mirrors the historical sample to maximise - diff-friendliness when comparing old vs new outputs. + Field order intentionally follows the reference sample to maximise + diff-friendliness across generated outputs. """ meta: Meta @@ -239,6 +265,15 @@ class FeatureSpecOutput(BaseModel): "repository." ), ) + @property + def target_language(self) -> str | None: + """The primary target programming language.""" + return self.meta.primary_language + + @property + def target_languages(self) -> List[str]: + """The ordered target programming language list.""" + return self.meta.target_languages __all__ = [ diff --git a/CoderMind/scripts/feature/spec.py b/CoderMind/scripts/feature/spec.py index 4ff8db1..2081255 100644 --- a/CoderMind/scripts/feature/spec.py +++ b/CoderMind/scripts/feature/spec.py @@ -1,11 +1,9 @@ -"""Phase-1 ``feature_spec`` stage — direct JSON generation. +"""Generate ``feature_spec.json`` directly from raw requirements. Reads raw requirements (inline text or ``docs/*.md`` files), drives an LLM via :class:`LLMClient`, and writes a validated ``feature_spec.json`` ready for downstream stages (``feature_build`` etc.) to consume. 
-This module replaces the historical Markdown-intermediary pipeline -(``feature_spec.md`` slash command + ``feature_spec_to_json.py`` parser). The LLM emits the final JSON directly, validated against :class:`feature.schemas.spec.FeatureSpecOutput`. @@ -29,12 +27,14 @@ import json import logging +import re from dataclasses import dataclass, field from datetime import date from pathlib import Path from typing import List, Optional from common.llm_client import LLMClient +from common.language_meta import normalize_language_metadata from common.paths import FEATURE_SPEC_FILE, WORKSPACE_ROOT from common.trajectory import load_or_create_trajectory @@ -216,7 +216,50 @@ def _call_llm( "LLM failed to produce a valid feature_spec.json after " f"{max_retries} attempts (see LLM trace logs for details)." ) - return result + inferred = _infer_target_languages(source) + if inferred and not result.meta.target_languages: + result.meta.target_languages = inferred + if inferred and not result.meta.primary_language: + result.meta.primary_language = inferred[0] + primary, languages = normalize_language_metadata( + result.meta.primary_language, + result.meta.target_languages, + ) + result.meta.primary_language = primary + result.meta.target_languages = languages + return FeatureSpecOutput.model_validate(result.model_dump()) + + +def _infer_target_languages(source: InputSource) -> list[str]: + """Infers implementation languages from requirement text.""" + text = _source_text(source).lower() + patterns = [ + ("typescript", r"\btypescript\b|\bts\b"), + ("javascript", r"\bjavascript\b|\bnode(?:\.js)?\b"), + ("go", r"\bgolang\b|\bgo\.mod\b|\bgo (?:test|run|build)\b|\bgo language\b|\bgo project\b"), + ("rust", r"\brust\b|\bcargo\b"), + ("cpp", r"\bc\+\+\b|\bcpp\b"), + ("c", r"\bc language\b|\bc project\b"), + ("python", r"(? 
str: + """Returns all source requirement text as one string.""" + if source.kind == "user_input": + return source.text or "" + chunks: list[str] = [] + for doc in source.docs: + try: + chunks.append(doc.read_text(encoding="utf-8")) + except OSError: + continue + return "\n".join(chunks) # =========================================================================== diff --git a/CoderMind/scripts/feature_build.py b/CoderMind/scripts/feature_build.py index 5782f75..89126a3 100644 --- a/CoderMind/scripts/feature_build.py +++ b/CoderMind/scripts/feature_build.py @@ -24,6 +24,7 @@ ) from common import print_unicode_table, get_all_leaf_paths, get_leaf_name, get_all_leaf_descriptions from common.llm_client import LLMClient +from common.language_meta import extract_language_metadata, metadata_with_languages from common.trajectory import load_or_create_trajectory # ======================== Configuration ======================== @@ -269,6 +270,38 @@ def convert_leaves_to_list(tree: Dict[str, Any]) -> Dict[str, Any]: return result +def _list_leaf_to_branch(item: Any) -> tuple[str | None, Any]: + """Converts a list leaf into a branch entry.""" + if isinstance(item, str): + return item.strip(), [] + if isinstance(item, dict): + name = item.get("name") + if isinstance(name, str) and name.strip(): + children = item.get("children", []) + if isinstance(children, (dict, list)): + return name.strip(), copy.deepcopy(children) + return name.strip(), [] + if len(item) == 1: + key, value = next(iter(item.items())) + if isinstance(key, str) and key.strip(): + if isinstance(value, (dict, list)): + return key.strip(), copy.deepcopy(value) + return key.strip(), [] + if item is None: + return None, [] + return str(item).strip(), [] + + +def _list_leaves_to_branch(items: List[Any]) -> Dict[str, Any]: + """Promotes list leaves to a branch map.""" + branch: Dict[str, Any] = {} + for item in items: + key, value = _list_leaf_to_branch(item) + if key: + branch.setdefault(key, value) + return branch + 
+ def apply_changes(tree: Dict[str, Any], paths: List[str]) -> Dict[str, Any]: """Apply path list to tree structure. @@ -298,10 +331,8 @@ def apply_changes(tree: Dict[str, Any], paths: List[str]) -> Dict[str, Any]: if part not in current: current[part] = {} elif isinstance(current[part], list): - # If we encounter a list, we need to convert it to a dict - # This happens when a previous leaf node needs to become a branch old_list = current[part] - current[part] = {item: {} for item in old_list} + current[part] = _list_leaves_to_branch(old_list) elif not isinstance(current[part], dict): # Unexpected type, convert to dict current[part] = {} @@ -320,9 +351,8 @@ def apply_changes(tree: Dict[str, Any], paths: List[str]) -> Dict[str, Any]: if leaf not in current[parent_key]: current[parent_key].append(leaf) elif isinstance(current[parent_key], dict): - # If it's a dict (previously a branch node), we have a conflict - # This means some paths treat this as a leaf parent, others as a branch - # Keep it as a dict and add the leaf as a key with empty value + # This path segment is used both as a leaf parent and as a + # branch. Keep the branch shape and add the leaf key. 
if leaf not in current[parent_key]: current[parent_key][leaf] = [] else: @@ -866,6 +896,11 @@ def _extract_paths_and_descs(items: List) -> Tuple[List[str], Dict[str, str]]: return paths, desc_map +def _target_languages(data: Dict[str, Any]) -> List[str]: + """Returns normalized target language list from feature data.""" + return extract_language_metadata(data)[1] + + def _save_intermediate( feature_tree: Dict[str, Any], current_tree: Dict[str, Any], @@ -879,6 +914,7 @@ def _save_intermediate( "repository_name": feature_tree.get("repository_name", "unknown"), "repository_purpose": feature_tree.get("repository_purpose", ""), "repository_specification": feature_tree.get("repository_specification", ""), + "meta": metadata_with_languages(feature_tree), "feature_tree": current_tree, "previous_feature_tree": previous_feature_tree, "iteration_logs": iteration_logs, @@ -1487,6 +1523,7 @@ def build_from_spec( "repository_name": feature_tree.get("repository_name", "unknown"), "repository_purpose": feature_tree.get("repository_purpose", ""), "repository_specification": feature_tree.get("repository_specification", ""), + "meta": metadata_with_languages(feature_tree), "feature_tree": current_tree, "previous_feature_tree": previous_feature_tree, "iteration_logs": iteration_logs, @@ -1678,6 +1715,7 @@ def expand_with_direction( "repository_name": feature_tree.get("repository_name", "unknown"), "repository_purpose": feature_tree.get("repository_purpose", ""), "repository_specification": feature_tree.get("repository_specification", ""), + "meta": metadata_with_languages(feature_tree), "feature_tree": current_tree, "previous_feature_tree": previous_feature_tree, "iteration_logs": iteration_logs, @@ -2009,6 +2047,7 @@ def _load_feature_data(feature_build_path: Path, feature_spec_path: Path) -> Dic "repository_name": "", "repository_purpose": "", "repository_specification": "", + "meta": {}, "feature_tree": {}, } @@ -2026,6 +2065,14 @@ def _load_feature_data(feature_build_path: Path, 
feature_spec_path: Path) -> Dic f"Loaded repository_purpose from feature_spec.json ({len(spec_repo_purpose)} chars)" ) + feature_tree["meta"] = metadata_with_languages(feature_spec) + languages = _target_languages(feature_tree) + if languages: + logger.info( + "Loaded target languages from feature_spec.json: %s", + ", ".join(languages), + ) + spec_content = feature_spec_path.read_text(encoding="utf-8").strip() if spec_content: feature_tree["repository_specification"] = spec_content diff --git a/CoderMind/scripts/feature_construct.py b/CoderMind/scripts/feature_construct.py index 87b966a..b8e35c3 100644 --- a/CoderMind/scripts/feature_construct.py +++ b/CoderMind/scripts/feature_construct.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Phase 1 feature construction facade orchestrator.""" +"""Feature construction facade orchestrator.""" from __future__ import annotations @@ -14,6 +14,7 @@ from pathlib import Path from typing import Any, Optional +from common.language_meta import extract_language_metadata from common.paths import FEATURE_BUILD_FILE as _FEATURE_BUILD_FILE from common.paths import FEATURE_SPEC_FILE as _FEATURE_SPEC_FILE from common.paths import FEATURE_TREE_FILE as _FEATURE_TREE_FILE @@ -113,7 +114,71 @@ def _has_content(value: Any) -> bool: return True -def _state(stage: Stage, type_: str, message: str, raw: Optional[dict[str, Any]] = None) -> StageState: +def _language_fields(data: dict[str, Any]) -> tuple[Optional[str], list[str]]: + return extract_language_metadata(data) + + +def _language_raw(data: dict[str, Any]) -> dict[str, Any]: + primary, languages = _language_fields(data) + return { + "primary_language": primary, + "target_languages": languages, + } + + +def _expected_language_fields() -> tuple[Optional[str], list[str]]: + data, error = _load_json_object(FEATURE_SPEC_FILE) + if error or data is None: + return None, [] + return _language_fields(data) + + +def _language_errors( + logical: str, + data: dict[str, Any], + *, + expected: 
tuple[Optional[str], list[str]] | None = None, + required: bool = False, +) -> list[str]: + primary, languages = _language_fields(data) + errors: list[str] = [] + meta = data.get("meta") if isinstance(data.get("meta"), dict) else {} + raw_primary = meta.get("primary_language") + raw_languages = meta.get("target_languages") + has_primary = isinstance(raw_primary, str) and bool(raw_primary.strip()) + has_languages = isinstance(raw_languages, list) and any( + isinstance(item, str) and item.strip() for item in raw_languages + ) + if required and not has_primary: + errors.append(f"{logical} is missing meta.primary_language") + if required and not has_languages: + errors.append(f"{logical} is missing meta.target_languages") + if primary and languages and primary != languages[0]: + errors.append( + f"{logical} meta.primary_language does not match " + "meta.target_languages[0]" + ) + if expected: + expected_primary, expected_languages = expected + if expected_primary and primary != expected_primary: + errors.append( + f"{logical} meta.primary_language={primary!r}, " + f"expected {expected_primary!r}" + ) + if expected_languages and languages != expected_languages: + errors.append( + f"{logical} meta.target_languages={languages!r}, " + f"expected {expected_languages!r}" + ) + return errors + + +def _state( + stage: Stage, + type_: str, + message: str, + raw: Optional[dict[str, Any]] = None, +) -> StageState: return StageState( stage=stage, type=type_, @@ -131,7 +196,11 @@ def _check_feature_spec(stage: Stage) -> StageState: if error: return _state(stage, "warning", f"{logical} is not complete: {error}") - missing = [field for field in _REQUIRED_FEATURE_SPEC_FIELDS if not _has_content(data.get(field))] + missing = [ + field + for field in _REQUIRED_FEATURE_SPEC_FIELDS + if not _has_content(data.get(field)) + ] if missing: return _state( stage, @@ -139,7 +208,15 @@ def _check_feature_spec(stage: Stage) -> StageState: f"{logical} is missing required fields: {', 
'.join(missing)}", {"missing_fields": missing}, ) - return _state(stage, "update", f"{logical} is valid") + language_errors = _language_errors(logical, data, required=True) + if language_errors: + return _state( + stage, + "warning", + "; ".join(language_errors), + _language_raw(data), + ) + return _state(stage, "update", f"{logical} is valid", _language_raw(data)) def _check_feature_build(stage: Stage) -> StageState: @@ -149,7 +226,17 @@ def _check_feature_build(stage: Stage) -> StageState: return _state(stage, "init", f"{logical} is missing") if error: return _state(stage, "warning", f"{logical} is not complete: {error}") - return _state(stage, "update", f"{logical} is valid JSON", {"keys": sorted(data.keys())}) + expected = _expected_language_fields() + language_errors = _language_errors( + logical, + data, + expected=expected, + required=bool(expected[0] or expected[1]), + ) + raw = {"keys": sorted(data.keys()), **_language_raw(data)} + if language_errors: + return _state(stage, "warning", "; ".join(language_errors), raw) + return _state(stage, "update", f"{logical} is valid JSON", raw) def _check_feature_refactor(stage: Stage) -> StageState: @@ -162,7 +249,16 @@ def _check_feature_refactor(stage: Stage) -> StageState: components = data.get("components") if isinstance(components, (list, dict)) and components: - return _state(stage, "update", f"{logical} has components") + expected = _expected_language_fields() + language_errors = _language_errors( + logical, + data, + expected=expected, + required=bool(expected[0] or expected[1]), + ) + if language_errors: + return _state(stage, "warning", "; ".join(language_errors), _language_raw(data)) + return _state(stage, "update", f"{logical} has components", _language_raw(data)) return _state(stage, "warning", f"{logical} has no non-empty components collection") @@ -224,7 +320,7 @@ def _print_probe_summary(states: list[StageState]) -> None: if first_pending: print(f"Next pending stage: {first_pending}") else: - 
print("All Phase 1 stages are up-to-date.") + print("All feature construction stages are up-to-date.") print() print(_format_table(states)) @@ -246,6 +342,7 @@ def _check_only_payload(states: list[StageState]) -> dict[str, Any]: "done": state.done, "will_run": state.will_run, "reason": state.reason, + "details": state.raw, } for state in states ], @@ -326,11 +423,11 @@ def _reset_output_if_needed(state: StageState) -> None: def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: parser = argparse.ArgumentParser( prog="feature_construct.py", - description="Run the Phase 1 feature construction pipeline with automatic resume.", + description="Run the feature construction pipeline with automatic resume.", ) parser.add_argument("--check-only", action="store_true", help="Probe all stages and exit.") parser.add_argument("--json", action="store_true", help="With --check-only, emit JSON.") - parser.add_argument("--force", action="store_true", help="Rebuild all Phase 1 stages.") + parser.add_argument("--force", action="store_true", help="Rebuild all feature construction stages.") parser.add_argument("--dry-run", action="store_true", help="Print commands without executing them.") parser.add_argument("--verbose", action="store_true", help="Forward native verbose logging flags.") parser.add_argument("--no-trajectory", action="store_true", help="Disable trajectory recording where supported.") @@ -413,7 +510,7 @@ def main(argv: Optional[list[str]] = None) -> int: runnable = [state for state in states if state.will_run] if not runnable: - print("All Phase 1 stages are already complete — nothing to do.") + print("All feature construction stages are already complete — nothing to do.") print("Use `--force` to rebuild from scratch.") print("Next: `/cmind.plan` to build the Repository Planning Graph (RPG).") return 0 diff --git a/CoderMind/scripts/feature_edit.py b/CoderMind/scripts/feature_edit.py index 305b0ad..3bc3a65 100644 --- a/CoderMind/scripts/feature_edit.py 
+++ b/CoderMind/scripts/feature_edit.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 -"""Edit Feature Tree Script (Three-Phase Approach). +"""Edit Feature Tree Script. -Phase 1: Planning - Analyze all components and generate edit plan -Phase 2: Execution - Execute the plan precisely on each component -Phase 3: Review - Verify changes and auto-fix if needed (up to 3 rounds) +Workflow: +1. Planning - Analyze all components and generate an edit plan. +2. Execution - Apply the plan precisely to each component. +3. Review - Verify changes and auto-fix if needed (up to 3 rounds). Input/Output: .cmind/data/feature_tree.json """ @@ -253,7 +254,7 @@ class ComponentOperation(BaseModel): class EditPlan(BaseModel): - """Complete edit plan generated in Phase 1.""" + """Complete edit plan generated by the planning step.""" summary: str = Field(description="Overall summary of the edit plan") operations: List[ComponentOperation] = Field( @@ -264,7 +265,7 @@ class EditPlan(BaseModel): class ReviewResult(BaseModel): - """Review result generated in Phase 3.""" + """Review result generated by the review step.""" thinking: str = Field(description="Detailed thinking process of the review") summary: str = Field( @@ -293,12 +294,12 @@ class ReviewResult(BaseModel): # ============================================================================ -# Three-Phase Feature Tree Editor +# Feature Tree Editor # ============================================================================ class FeatureTreeEditor: - """Feature tree editor with three-phase approach: Planning + Execution + Review.""" + """Feature tree editor with planning, execution, and review steps.""" def __init__(self, llm_client: LLMClient, enable_review: bool = True): self.llm = llm_client @@ -321,18 +322,18 @@ def edit( repo_data: Dict[str, Any], model_analysis: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: - """Execute three-phase editing workflow: Plan -> Execute -> Review.""" + """Execute the planning, execution, and review 
workflow.""" # Capture state before edit self.state_before = self._capture_state(components) self.logger.info("=" * 70) - self.logger.info("PHASE 1: PLANNING") + self.logger.info("STEP: PLANNING") self.logger.info("=" * 70) # Build components summary for planning components_summary = self._build_components_summary(components) - # Phase 1: Generate edit plan + # Generate edit plan. plan = self._generate_edit_plan(components_summary, edit_instruction, repo_data) if plan is None: @@ -347,10 +348,10 @@ def edit( self._display_plan(plan) self.logger.info("\n" + "=" * 70) - self.logger.info("PHASE 2: EXECUTION") + self.logger.info("STEP: EXECUTION") self.logger.info("=" * 70) - # Phase 2: Execute the plan + # Execute the plan. execution_results = self._execute_plan(plan, components) # Capture state after edit @@ -367,7 +368,7 @@ def edit( "execution_results": execution_results, } - # Phase 3: Review with auto-fix loop (max 3 iterations) + # Review with auto-fix loop (max 3 iterations). if self.enable_review: MAX_REVIEW_ITERATIONS = 3 review_iterations = [] @@ -690,7 +691,7 @@ def _generate_edit_plan( user_instructions: str, repo_data: Dict[str, Any], ) -> Optional[EditPlan]: - """Phase 1: Generate edit plan using LLM.""" + """Generate an edit plan using the LLM.""" prompt = PROMPT_TEMPLATE_EDIT_PLAN.format( edit_instruction=user_instructions, repository_name=repo_data.get("repository_name", "Unknown"), @@ -742,7 +743,7 @@ def _display_plan(self, plan: EditPlan): def _execute_plan( self, plan: EditPlan, components: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - """Phase 2: Execute the edit plan.""" + """Execute the edit plan.""" # Build component lookup comp_by_name = {comp.get("name"): comp for comp in components} @@ -857,7 +858,7 @@ def _review_execution( model_analysis: Optional[Dict[str, Any]] = None, components: Optional[List[Dict[str, Any]]] = None, ) -> Optional[ReviewResult]: - """Phase 3: Review the execution results.""" + """Review the execution results.""" # 
Format plan operations for prompt plan_ops_str = "" for i, op in enumerate(plan.operations, 1): diff --git a/CoderMind/scripts/feature_refactor.py b/CoderMind/scripts/feature_refactor.py index 140b96c..41f92de 100644 --- a/CoderMind/scripts/feature_refactor.py +++ b/CoderMind/scripts/feature_refactor.py @@ -20,6 +20,7 @@ from common.paths import FEATURE_BUILD_FILE, FEATURE_TREE_FILE from common import print_unicode_table, get_all_leaf_paths, get_leaf_name, get_all_leaf_descriptions from common.llm_client import LLMClient +from common.language_meta import metadata_with_languages from common.trajectory import load_or_create_trajectory @@ -460,6 +461,7 @@ def refactor( "repository_specification": json.dumps( repo_data.get("repository_specification", {}), indent=2 ), + "meta": metadata_with_languages(repo_data), "features": feature_tree, "components": components, # "components_format": convert_component_to_features(components), @@ -1091,6 +1093,7 @@ def main(): "background_and_overview": background_and_overview, "functional_requirements": functional_requirements, "repository_specification": repo_specification_data, + "meta": metadata_with_languages(repo_specification_data), } # Create LLM client # Initialize trajectory diff --git a/CoderMind/scripts/feature_spec.py b/CoderMind/scripts/feature_spec.py index 82cc026..af5fa97 100644 --- a/CoderMind/scripts/feature_spec.py +++ b/CoderMind/scripts/feature_spec.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Phase-1 ``feature_spec`` stage — CLI wrapper. +"""CLI wrapper for the ``feature_spec`` stage. This is the entry point invoked by ``cmind script feature_spec.py`` and by the ``feature_construct`` orchestrator. 
All real work happens in diff --git a/CoderMind/scripts/func_design/__init__.py b/CoderMind/scripts/func_design/__init__.py index 9ab9948..ad16c07 100644 --- a/CoderMind/scripts/func_design/__init__.py +++ b/CoderMind/scripts/func_design/__init__.py @@ -26,7 +26,7 @@ from .base_class_agent import ( BaseClassAgent, LLMClient as BaseClassLLMClient, - extract_class_names, + extract_declaration_names, validate_base_classes, DataStructureDefinition, extract_data_flow_types, @@ -93,7 +93,7 @@ # Base Class "BaseClassAgent", "BaseClassLLMClient", - "extract_class_names", + "extract_declaration_names", "validate_base_classes", "DataStructureDefinition", "extract_data_flow_types", diff --git a/CoderMind/scripts/func_design/base_class_agent.py b/CoderMind/scripts/func_design/base_class_agent.py index e49ebd3..23f9764 100644 --- a/CoderMind/scripts/func_design/base_class_agent.py +++ b/CoderMind/scripts/func_design/base_class_agent.py @@ -11,7 +11,6 @@ import json import logging -import ast from typing import Dict, List, Optional, Tuple, Any from pydantic import BaseModel, Field, field_validator @@ -24,11 +23,10 @@ import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) -from common import ( - LLMClient, - validate_python_syntax, - extract_class_names, -) +from common import LLMClient +from decoder_lang import get_backend +from decoder_lang.backend import LanguageBackend +from decoder_lang.prompt_directive import with_language_directive # ============================================================================ @@ -38,7 +36,7 @@ class BaseClassDefinition(BaseModel): """Definition of a base class or data structure.""" file_path: str = Field(..., description="Path where this base class should be placed") - code: str = Field(..., description="Full Python code for the class") + code: str = Field(..., description="Full target-language code for the definition") scope: str = Field(..., description="Scope: 'global' or a specific subtree/component 
name") subclasses: Dict[str, List[str]] = Field(..., description="Mapping from base class name to list of concrete subclass names (each list must have at least 2 items)") @@ -63,7 +61,7 @@ class DataStructureDefinition(BaseModel): Note: file_path is NOT assigned here. It will be assigned later by the interface designer and written back to base_classes.json. """ - code: str = Field(..., description="Python stub code (dataclass skeleton with fields and type annotations)") + code: str = Field(..., description="Target-language data structure stub code") subtree: str = Field(..., description="The functional area / subtree this data structure belongs to (must be a valid subtree name, NOT 'global')") data_flow_types: List[str] = Field(..., min_length=1, description="Which data_flow data_type names this definition covers") file_path: str = Field(default="", description="File path assigned later by the interface designer. Leave empty during base class design.") @@ -81,9 +79,10 @@ class BaseClassOutput(BaseModel): def validate_base_classes_model( model: "BaseClassOutput", - valid_subtrees: Optional[List[str]] = None + valid_subtrees: Optional[List[str]] = None, + backend: Optional[LanguageBackend] = None, ) -> Tuple[bool, str]: - """Validate base class definitions from Pydantic model: 1. Code has valid Python syntax 2. Scope is either 'global' or an exact match to a valid subtree name. + """Validate base class definitions from a Pydantic model. 
Args: model: BaseClassOutput Pydantic model @@ -91,6 +90,7 @@ def validate_base_classes_model( Returns: (is_valid, error_message) """ + backend = backend or get_backend("python") # Build set of valid scope values valid_scopes = {"global"} if valid_subtrees: @@ -108,8 +108,7 @@ def validate_base_classes_model( ) continue - # Validate Python syntax - is_valid, error_msg = validate_python_syntax(bc.code) + is_valid, error_msg = backend.syntax_check(bc.code, bc.file_path) if not is_valid: errors.append(f"Base class {i} ({bc.file_path}): syntax error - {error_msg}") @@ -121,9 +120,10 @@ def validate_base_classes_model( def validate_base_classes( base_classes: List[Dict[str, Any]], - valid_subtrees: Optional[List[str]] = None + valid_subtrees: Optional[List[str]] = None, + backend: Optional[LanguageBackend] = None, ) -> Tuple[bool, str]: - """Validate base class definitions: 1. Each has file_path, code, and scope 2. Code has valid Python syntax 3. Scope is either 'global' or an exact match to a valid subtree name. + """Validate base class definitions. 
Args: base_classes: List of base class definitions @@ -131,6 +131,7 @@ def validate_base_classes( Returns: (is_valid, error_message) """ + backend = backend or get_backend("python") if not base_classes: return False, "Empty base classes provided" @@ -186,7 +187,7 @@ def validate_base_classes( ) continue - is_valid, error_msg = validate_python_syntax(code) + is_valid, error_msg = backend.syntax_check(code, file_path) if not is_valid: errors.append(f"Base class {i} ({file_path}): syntax error - {error_msg}") @@ -217,12 +218,28 @@ def extract_data_flow_types(data_flow: List[Dict[str, Any]]) -> List[str]: return sorted(types) +def extract_declaration_names(code: str, backend: LanguageBackend) -> List[str]: + """Extract top-level type or function names from target-language code.""" + names: List[str] = [] + for unit in backend.list_code_units(code): + if getattr(unit, "parent", None) is not None: + continue + if getattr(unit, "unit_type", "") in { + "class", "struct", "interface", "function", + }: + name = getattr(unit, "name", "") + if name and name not in names: + names.append(name) + return names + + def validate_data_structures( data_structures: List[Dict[str, Any]], data_flow_types: List[str], - valid_subtrees: Optional[List[str]] = None + valid_subtrees: Optional[List[str]] = None, + backend: Optional[LanguageBackend] = None, ) -> Tuple[bool, str]: - """Validate data structure definitions: 1. Each has code, subtree, and data_flow_types 2. Code has valid Python syntax 3. Subtree is a valid subtree name (NOT 'global'). + """Validate data structure definitions. Note: file_path is NOT validated here — it is assigned later by the interface designer. 
@@ -234,6 +251,7 @@ def validate_data_structures( Returns: (is_valid, error_message) """ + backend = backend or get_backend("python") # Build set of valid subtree values (no 'global' for data structures) valid_subtree_set = set() if valid_subtrees: @@ -275,8 +293,7 @@ def validate_data_structures( errors.append(f"Data structure {i}: data_flow_types must not be empty") continue - # Validate Python syntax - is_valid, error_msg = validate_python_syntax(code) + is_valid, error_msg = backend.syntax_check(code, f"data_structure{backend.file_extension}") if not is_valid: errors.append(f"Data structure {i} (subtree={subtree}): syntax error - {error_msg}") @@ -301,7 +318,8 @@ def __init__( max_iterations: int = 5, logger: Optional[logging.Logger] = None, trajectory: Optional[Any] = None, - step_id: Optional[int] = None + step_id: Optional[int] = None, + target_language: Optional[str] = None, ): # Create LLMClient with trajectory support if not provided if llm_client is None: @@ -313,6 +331,7 @@ def __init__( self.llm.set_trajectory(trajectory, step_id) self.max_iterations = max_iterations self.logger = logger or logging.getLogger(__name__) + self.backend = get_backend(target_language) def design_base_classes( self, @@ -343,7 +362,7 @@ def design_base_classes( self.logger.info(f"[BaseClassAgent] Designing base classes for {repo_name}") # Build system prompt (tool description is now integrated) - system_prompt = BASE_CLASS_PROMPT + system_prompt = with_language_directive(BASE_CLASS_PROMPT, self.backend) # Extract unique data_type values from data flow (for post-validation) data_flow_type_names = extract_data_flow_types(data_flow) @@ -368,9 +387,11 @@ def design_base_classes( design base classes that are idiomatic for those technologies rather than purely abstract. For example, if the project uses Flask, prefer Flask Blueprint patterns over generic abstract request handlers. If no specific technology is mentioned, -use abstract base classes (ABC). 
+ use the target language's idiomatic abstraction mechanism. """ + hints = self.backend.prompt_hints() + user_prompt = f"""Based on the repository structure and data flow, generate base class definitions: Repository Name: {repo_name} Repository Info: {repo_info} @@ -392,12 +413,12 @@ def design_base_classes( 1. Shared behavioral abstractions (base classes with abstract methods) 2. Common data structures that flow between components 3. Keep it minimal - only create abstractions that will be reused by multiple components -4. Use dataclasses for data structures, ABC for behavioral abstractions +4. Use idiomatic {hints.display_name} constructs for data structures and behavioral abstractions Additionally, for data_structures: - Data flow types that are generic enough to serve as base classes (with subclasses) should go into base_classes, not data_structures - The remaining data flow types that are NOT absorbed by base classes should be defined as data_structures -- Use @dataclass with explicit fields, type annotations, and docstrings +- Use idiomatic {hints.display_name} data containers with explicit fields and documentation - These are stubs (skeleton code) — they will be fully implemented later - Each data structure must belong to a specific subtree (not global) - Do NOT specify file_path — it will be assigned by the interface designer later""" @@ -431,7 +452,11 @@ def design_base_classes( data_structures = [ds.model_dump() for ds in result_model.data_structures] # Custom validation (scope and syntax) for base classes - is_valid, error_msg = validate_base_classes_model(result_model, valid_subtrees=functional_areas) + is_valid, error_msg = validate_base_classes_model( + result_model, + valid_subtrees=functional_areas, + backend=self.backend, + ) if not is_valid: self.logger.warning(f"[BaseClassAgent] Base class validation failed: {error_msg}") @@ -440,7 +465,10 @@ def design_base_classes( # Validate data structures ds_valid, ds_error = validate_data_structures( - 
data_structures, data_flow_type_names, valid_subtrees=functional_areas + data_structures, + data_flow_type_names, + valid_subtrees=functional_areas, + backend=self.backend, ) if not ds_valid: @@ -451,14 +479,12 @@ def design_base_classes( # Extract class names for logging all_classes = [] for bc in base_classes: - class_names = extract_class_names(bc.get("code", "")) - all_classes.extend(class_names) + all_classes.extend(extract_declaration_names(bc.get("code", ""), self.backend)) # Extract data structure class names ds_class_names = [] for ds in data_structures: - class_names = extract_class_names(ds.get("code", "")) - ds_class_names.extend(class_names) + ds_class_names.extend(extract_declaration_names(ds.get("code", ""), self.backend)) # Check data_flow_type coverage (base_classes code may also cover some types) bc_class_set = set(all_classes) diff --git a/CoderMind/scripts/func_design/base_class_prompts.py b/CoderMind/scripts/func_design/base_class_prompts.py index b8fb4b1..852486d 100644 --- a/CoderMind/scripts/func_design/base_class_prompts.py +++ b/CoderMind/scripts/func_design/base_class_prompts.py @@ -9,7 +9,7 @@ # ============================================================================ BASE_CLASS_PROMPT = """ -You are an expert software engineer designing reusable abstractions and shared data structures for a Python codebase. +You are an expert software engineer designing reusable abstractions and shared data structures for a target-language codebase. Your objective is to introduce only the minimum necessary set of well-justified base classes and shared data structures — enough to improve modularity and consistency, but not so many that the system becomes rigid or over-engineered. @@ -42,8 +42,8 @@ Provide standardized data containers that flow across subtrees and pipeline components. Requirements: -- Should be fully implemented (for example, dataclasses). -- Must use explicit fields with type annotations and meaningful docstrings. 
+- Should be fully implemented using idiomatic target-language constructs. +- Must use explicit fields/types and meaningful documentation. - Represent real semantic units, not generic catch-all containers. Design Guidelines: @@ -56,7 +56,7 @@ Some `data_type` labels from the data flow graph may be generic enough to be modeled as base classes (with subclasses). Those should go into `base_classes` above. The **remaining** data flow types — those that are concrete, self-contained data containers — should be defined here as data structure stubs. These stubs ensure design continuity and will be fully implemented during later code generation batches. Requirements: -- Should be `@dataclass` stubs with explicit fields, type annotations, and docstrings. +- Should be target-language data container stubs with explicit fields and documentation. - Fields should be inferred from the data flow context (source, target, transformation descriptions). - Mark fields with reasonable defaults or `None` where the full implementation is not yet known. - These are **stubs** — they will be fully implemented later. Keep them minimal but structurally correct. @@ -88,18 +88,18 @@ {{ "base_classes": [ {{ - "file_path": "Path to the Python file where the base class code should live (string).", - "code": "Full Python source code for that file, including base class definitions (string).", + "file_path": "Path to the target-language source file where the abstraction code should live (string).", + "code": "Full target-language source code for that file, including abstraction definitions (string).", "scope": "'global' for repository-wide (L0) base class, or a specific subtree/functional area name (**NOT** directory name) for module-level (L1) base class (string, required).", "subclasses": "Mapping from each base class name to its concrete subclass names (object, required). Example: {\"BaseNode\": [\"ItemNode\", \"FunctionNode\"], \"BaseConfig\": [\"RunConfig\", \"TestConfig\"]}. 
Each base class must have at least 2 subclasses." }} ], "data_structures": [ {{ - "code": "Python stub code: @dataclass skeleton with fields, type annotations, and docstrings (string).", + "code": "Target-language data structure stub code with fields and documentation (string).", "subtree": "The functional area / subtree name this data structure belongs to (string, required). Must be one of the Functional Areas listed in the prompt. Do NOT use 'global'.", "data_flow_types": "List of data_type names from the data flow that this definition covers (list of strings, required, at least 1). Example: [\"ParsedExpression\", \"TokenList\"]", - "file_path": "Path to the Python file where this data structure stub should live (string, optional). If not provided, the interface designer will assign it during integration." + "file_path": "Path to the target-language source file where this data structure stub should live (string, optional). If not provided, the interface designer will assign it during integration." }} ] }} @@ -112,7 +112,7 @@ """ BASE_CLASS_REVIEW_PROMPT = """ -You are a senior software architect reviewing a set of functional base classes and global shared data structures for a Python repo. +You are a senior software architect reviewing a set of functional base classes and global shared data structures for a target-language repo. These abstractions are foundational contracts for future modules and subtrees. 
Core constraint: diff --git a/CoderMind/scripts/func_design/data_flow_agent.py b/CoderMind/scripts/func_design/data_flow_agent.py index 8c45973..5b38b2f 100644 --- a/CoderMind/scripts/func_design/data_flow_agent.py +++ b/CoderMind/scripts/func_design/data_flow_agent.py @@ -26,6 +26,8 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from common import LLMClient +from decoder_lang import get_backend +from decoder_lang.prompt_directive import with_language_directive # ============================================================================ @@ -194,7 +196,8 @@ def __init__( max_iterations: int = 5, logger: Optional[logging.Logger] = None, trajectory: Optional[Any] = None, - step_id: Optional[int] = None + step_id: Optional[int] = None, + target_language: Optional[str] = None, ): # Create LLMClient with trajectory support if not provided if llm_client is None: @@ -206,6 +209,7 @@ def __init__( self.llm.set_trajectory(trajectory, step_id) self.max_iterations = max_iterations self.logger = logger or logging.getLogger(__name__) + self.backend = get_backend(target_language) def build_data_flow( self, @@ -233,7 +237,7 @@ def build_data_flow( self.logger.info(f"[DataFlowAgent] Building data flow for {len(functional_areas)} components") # Build system prompt (tool description is now integrated) - system_prompt = DATA_FLOW_PROMPT + system_prompt = with_language_directive(DATA_FLOW_PROMPT, self.backend) # Build user prompt areas_str = format_functional_areas(functional_areas, component_dirs) diff --git a/CoderMind/scripts/func_design/data_flow_prompts.py b/CoderMind/scripts/func_design/data_flow_prompts.py index 3c9b926..55e6c29 100644 --- a/CoderMind/scripts/func_design/data_flow_prompts.py +++ b/CoderMind/scripts/func_design/data_flow_prompts.py @@ -9,7 +9,7 @@ # ============================================================================ DATA_FLOW_PROMPT = """ -You are a system architect designing the **inter-subtree data flow** for a 
Python repository. +You are a system architect designing the **inter-subtree data flow** for a software repository. Your goal is to describe **how data moves** between functional subtrees as a **directed acyclic graph (DAG)** of edges. Each edge represents one data object passed from one subtree to another. diff --git a/CoderMind/scripts/func_design/func_designer.py b/CoderMind/scripts/func_design/func_designer.py index 18fd5e5..c3d1499 100644 --- a/CoderMind/scripts/func_design/func_designer.py +++ b/CoderMind/scripts/func_design/func_designer.py @@ -319,21 +319,21 @@ def run_full_pipeline(self) -> Dict[str, Any]: "success": True } - # Phase 1: Data Flow + # Data-flow design step. data_flow_result = self.run_data_flow_phase() results["data_flow_phase"] = data_flow_result if not data_flow_result.get("success", False): self.logger.warning("Data flow phase had issues, continuing...") - # Phase 2: Base Classes + # Base-class design step. base_class_result = self.run_base_class_phase() results["base_classes_phase"] = base_class_result if not base_class_result.get("success", False): self.logger.warning("Base class phase had issues, continuing...") - # Phase 3: Interfaces + # Interface design step. 
interface_result = self.run_interface_phase() results["interfaces_phase"] = interface_result diff --git a/CoderMind/scripts/func_design/interface_agent.py b/CoderMind/scripts/func_design/interface_agent.py index 1da782d..d08e28c 100644 --- a/CoderMind/scripts/func_design/interface_agent.py +++ b/CoderMind/scripts/func_design/interface_agent.py @@ -16,7 +16,7 @@ import re from typing import Dict, List, Optional, Tuple, Any, Set from collections import defaultdict, deque -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator # Import ParsedFile and CodeUnit for code parsing import sys @@ -24,6 +24,14 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from rpg.code_unit import ParsedFile, CodeUnit +# AST inspection routes through the decoder language backend so +# code-structure extraction can vary by target language. The direct +# ``ast`` import supports Python-specific node inspection for docstrings +# and annotation syntax below. +from decoder_lang import get_backend +from decoder_lang.backend import LanguageBackend +from decoder_lang.prompt_directive import with_language_directive + # Import common LLMClient with trajectory support from common import ( LLMClient, @@ -57,6 +65,21 @@ class InterfaceDefinition(BaseModel): code: str = Field(..., description="Python code for the interface") dependencies: Optional[InterfaceDependency] = Field(default=None, description="Declared dependencies") + @model_validator(mode="before") + @classmethod + def _normalise_aliases(cls, value: Any) -> Any: + if not isinstance(value, dict): + return value + normalised = dict(value) + if "features" not in normalised: + if "feature_paths" in normalised: + normalised["features"] = normalised["feature_paths"] + elif "feature_path" in normalised: + normalised["features"] = [normalised["feature_path"]] + if "dependencies" not in normalised and "dependency" in normalised: + normalised["dependencies"] = normalised["dependency"] + return normalised + 
class InterfaceOutput(BaseModel): """Output from LLM for interface design.""" @@ -68,6 +91,28 @@ class FileInterfaceBlock(BaseModel): file_path: str = Field(..., description="Path to the file being designed") interfaces: List[InterfaceDefinition] = Field(..., min_length=1, description="Interface definitions for this file") + @model_validator(mode="before") + @classmethod + def _normalise_aliases(cls, value: Any) -> Any: + if not isinstance(value, dict): + return value + normalised = dict(value) + if "file_path" not in normalised and "path" in normalised: + normalised["file_path"] = normalised["path"] + if "interfaces" not in normalised: + for alias in ("interface_definitions", "interface_units", "units"): + if alias in normalised: + normalised["interfaces"] = normalised[alias] + break + if "interfaces" not in normalised and "code" in normalised: + interface_data = { + key: normalised[key] + for key in ("features", "feature_paths", "feature_path", "code", "dependencies") + if key in normalised + } + normalised["interfaces"] = [interface_data] + return normalised + class SubtreeInterfaceOutput(BaseModel): """Output from LLM for subtree-level interface design (all files at once).""" @@ -111,7 +156,12 @@ class DependencyCollector: 2. LLM declarations - expected function calls declared by LLM """ - def __init__(self, known_base_classes: Set[str], known_types: Set[str]): + def __init__( + self, + known_base_classes: Set[str], + known_types: Set[str], + target_language: Optional[str] = None, + ): """Initialize the dependency collector. 
Args: @@ -120,6 +170,7 @@ def __init__(self, known_base_classes: Set[str], known_types: Set[str]): """ self.known_base_classes = known_base_classes self.known_types = known_types + self.backend = get_backend(target_language) self.original_edges: List[Dict[str, Any]] = [] self.inheritance_edges: List[Dict[str, Any]] = [] self.invocation_edges: List[Dict[str, Any]] = [] @@ -226,50 +277,62 @@ def analyze_code_dependencies( file_path: str, base_class_files: Dict[str, str] ): - """Analyze code to extract dependencies via AST parsing. - - Extracts: - - Inheritance relationships (class X(BaseClass)) - - Type references in annotations - + """Extract code-level dependency edges from interface code. + + Inheritance is resolved uniformly across languages through the + backend's :meth:`list_inheritance` (Python derives it from class + bases; tree-sitter backends emit ``inherits`` edges). Type + references from annotations are a Python-specific enrichment; + other languages supply equivalent ``uses_types`` information via + the LLM-declared dependencies (see :meth:`process_llm_dependencies`). + Args: - code: Python source code to analyze - file_path: Path of the file containing this code - base_class_files: Mapping of class names to their file paths + code: Interface source code to analyze. + file_path: Path of the file containing this code. + base_class_files: Mapping of class/type names to file paths. 
""" - try: - tree = ast.parse(code) - except SyntaxError: - return - - for node in ast.walk(tree): - # Extract inheritance - if isinstance(node, ast.ClassDef): - child_class = node.name - for base in node.bases: - parent_name = _extract_name_from_node(base) - if parent_name and parent_name in self.known_base_classes: - parent_file = base_class_files.get(parent_name) - self.add_inheritance(child_class, parent_name, file_path, parent_file) - - # Extract type references from function annotations - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): - func_name = node.name - # Check parameter types - for arg in node.args.args: - if arg.annotation: - types = _extract_type_names(arg.annotation) - for t in types: - if t in self.known_types: - type_file = base_class_files.get(t) - self.add_reference(f"function {func_name}", t, file_path, type_file) - # Check return type - if node.returns: - types = _extract_type_names(node.returns) - for t in types: + # Inheritance — language-agnostic via the backend. + for dep in self.backend.list_inheritance(code, file_path): + child = dep.src + parent = dep.symbol or dep.dst + if child and parent and parent in self.known_base_classes: + parent_file = base_class_files.get(parent) + self.add_inheritance(child, parent, file_path, parent_file) + + # Type references from annotations — Python-specific rich + # extraction. Other languages cover this via LLM ``uses_types``. 
+ if self.backend.name == "python": + self._analyze_python_type_references(code, file_path, base_class_files) + + def _analyze_python_type_references( + self, + code: str, + file_path: str, + base_class_files: Dict[str, str] + ): + """Add reference edges for Python parameter/return type annotations.""" + for unit in self.backend.list_code_units(code, file_path): + if unit.unit_type not in ("function", "method"): + continue + node = (unit.extra or {}).get("ast_node") + if node is None: + continue + func_name = unit.name + for arg in getattr(node.args, "args", []): + if arg.annotation is not None: + for t in _extract_type_names(arg.annotation): if t in self.known_types: type_file = base_class_files.get(t) - self.add_reference(f"function {func_name}", t, file_path, type_file) + self.add_reference( + f"function {func_name}", t, file_path, type_file, + ) + if getattr(node, "returns", None) is not None: + for t in _extract_type_names(node.returns): + if t in self.known_types: + type_file = base_class_files.get(t) + self.add_reference( + f"function {func_name}", t, file_path, type_file, + ) def process_llm_dependencies( self, @@ -494,7 +557,10 @@ class GlobalInterfaceRegistry: enabling accurate cross-subtree dependency edges. """ - def __init__(self): + def __init__(self, backend: Optional[LanguageBackend] = None): + # Target-language backend for declaration/signature parsing. + # Defaults to Python so standalone/legacy callers keep working. 
+ self.backend = backend or get_backend("python") # unit_name -> {file_path, subtree_name, unit_type, signature_summary, features} self.units: Dict[str, Dict[str, Any]] = {} # class_name -> file_path (for quick lookup) @@ -568,7 +634,9 @@ def register_from_subtree_result( bare_name = unit_name # Extract a signature summary from the code (first non-import, non-blank line) - signature_summary = self._extract_signature_summary(code, unit_type, bare_name) + signature_summary = self._extract_signature_summary( + code, unit_type, bare_name, self.backend + ) unit_info = { "file_path": file_path, @@ -744,67 +812,59 @@ def get_all_structured_listings_for_upstream( return "\n\n".join(listings) @staticmethod - def _extract_signature_summary(code: str, unit_type: str, bare_name: str) -> str: - """Extract a concise signature summary from interface code.""" + def _extract_signature_summary( + code: str, unit_type: str, bare_name: str, backend: LanguageBackend + ) -> str: + """Extract a concise signature summary from interface code. + + Declaration discovery routes through ``backend.list_code_units`` + and ``format_signature``. For Python, class summaries additionally + read direct base-class names from the preserved ``ClassDef`` in + ``unit.extra['ast_node']``; other backends omit bases gracefully. 
+ """ if not code: return bare_name - - try: - tree = ast.parse(code) - for node in ast.iter_child_nodes(tree): - if unit_type == "class" and isinstance(node, ast.ClassDef) and node.name == bare_name: - # For classes, list public methods with signatures - methods = [] - for item in node.body: - if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)): - if not item.name.startswith("_") or item.name == "__init__": - sig = GlobalInterfaceRegistry._format_func_signature(item) - methods.append(sig) - bases_str = "" - if node.bases: - bases = [_extract_name_from_node(b) for b in node.bases] - bases = [b for b in bases if b] - if bases: - bases_str = f"({', '.join(bases)})" - if methods: - return f"{bare_name}{bases_str} [{', '.join(methods[:5])}]" - return f"{bare_name}{bases_str}" - - elif unit_type == "function" and isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == bare_name: - return GlobalInterfaceRegistry._format_func_signature(node) - except SyntaxError: - pass - - return bare_name - - @staticmethod - def _format_func_signature(node) -> str: - """Format a function/method AST node into a concise signature string.""" - name = node.name - params = [] - for arg in node.args.args: - if arg.arg == "self": - continue - param_str = arg.arg - if arg.annotation: - type_str = ast.unparse(arg.annotation) if hasattr(ast, 'unparse') else "" - if type_str: - param_str = f"{arg.arg}: {type_str}" - params.append(param_str) - - ret_str = "" - if node.returns: - ret_type = ast.unparse(node.returns) if hasattr(ast, 'unparse') else "" - if ret_type: - ret_str = f" -> {ret_type}" - - # Truncate params if too many - if len(params) > 4: - params_str = ", ".join(params[:3]) + ", ..." - else: - params_str = ", ".join(params) - - return f"{name}({params_str}){ret_str}" + + units = backend.list_code_units(code, "") + if not units: + return bare_name + + # Find the matching top-level declaration. 
+ target = next( + (u for u in units + if u.unit_type == unit_type and u.name == bare_name and u.parent is None), + None, + ) + if target is None: + return bare_name + + if unit_type == "function": + return backend.format_signature(target) + + # Class case: collect direct-child methods + format bases. + # ``backend.list_code_units`` walks BFS so methods of this + # class are those whose ``parent`` matches ``bare_name``; + # source order is preserved within a single parent. + method_units = [ + u for u in units + if u.unit_type == "method" and u.parent == bare_name + ] + methods: List[str] = [] + for m in method_units: + if not m.name.startswith("_") or m.name == "__init__": + methods.append(backend.format_signature(m)) + + bases_str = "" + class_node = (target.extra or {}).get("ast_node") + if class_node is not None and getattr(class_node, "bases", None): + base_names = [_extract_name_from_node(b) for b in class_node.bases] + base_names = [b for b in base_names if b] + if base_names: + bases_str = f"({', '.join(base_names)})" + + if methods: + return f"{bare_name}{bases_str} [{', '.join(methods[:5])}]" + return f"{bare_name}{bases_str}" # ============================================================================ @@ -815,7 +875,8 @@ def cross_validate_imports_vs_calls( code: str, file_path: str, declared_calls: List[str], - global_registry: GlobalInterfaceRegistry + global_registry: GlobalInterfaceRegistry, + backend: LanguageBackend, ) -> List[Dict[str, str]]: """Parse import statements in interface code and cross-validate against declared calls. Identifies symbols that are imported from modules in the global registry but not declared as call dependencies. 
@@ -832,50 +893,52 @@ def cross_validate_imports_vs_calls( """ warnings = [] declared_set = set(declared_calls) - - try: - tree = ast.parse(code) - except SyntaxError: - return warnings - - for node in ast.walk(tree): - if isinstance(node, ast.ImportFrom): - module = node.module or "" - for alias in node.names: - symbol = alias.name - # Check if this symbol is in the global registry - resolved_file = global_registry.resolve_callee(symbol) - if resolved_file and resolved_file != file_path: - # Symbol is a known interface from another file - if symbol not in declared_set: - warnings.append({ - "imported_symbol": symbol, - "imported_from": module, - "resolved_file": resolved_file, - "file_path": file_path, - "message": ( - f"'{symbol}' is imported from '{module}' and is a known " - f"interface in '{resolved_file}', but not declared in " - f"dependencies.calls" - ) - }) - elif isinstance(node, ast.Import): - for alias in node.names: - symbol = alias.name.split(".")[-1] if "." in alias.name else alias.name - resolved_file = global_registry.resolve_callee(symbol) - if resolved_file and resolved_file != file_path: - if symbol not in declared_set: - warnings.append({ - "imported_symbol": symbol, - "imported_from": alias.name, - "resolved_file": resolved_file, - "file_path": file_path, - "message": ( - f"'{symbol}' is imported and is a known interface in " - f"'{resolved_file}', but not declared in dependencies.calls" - ) - }) - + + # Import discovery routes through the target language backend. + # ``list_imports`` returns one LPDependency per imported symbol; + # ``extra["module"]`` holds the source module and ``extra["imported"]`` + # is present for ``from X import Y`` statements. Backends whose imports + # do not populate these fields simply yield no warnings. + for dep in backend.list_imports(code, file_path): + extra = dep.extra or {} + module = extra.get("module") or "" + + if "imported" in extra: + # ``from X import Y`` — symbol is the + # imported name.
Aliases do not affect registry lookup. + symbol = extra.get("imported") or "" + imported_from = module + message_suffix = ( + f"'{symbol}' is imported from '{module}' and is a known " + f"interface in '{{resolved_file}}', but not declared in " + f"dependencies.calls" + ) + else: + # ``import X`` — symbol is the last dotted segment + # of the module path used for registry lookup. + full_name = module + symbol = full_name.rsplit(".", 1)[-1] if "." in full_name else full_name + imported_from = full_name + message_suffix = ( + f"'{symbol}' is imported and is a known interface in " + f"'{{resolved_file}}', but not declared in dependencies.calls" + ) + + if not symbol: + continue + resolved_file = global_registry.resolve_callee(symbol) + if not (resolved_file and resolved_file != file_path): + continue + if symbol in declared_set: + continue + warnings.append({ + "imported_symbol": symbol, + "imported_from": imported_from, + "resolved_file": resolved_file, + "file_path": file_path, + "message": message_suffix.format(resolved_file=resolved_file), + }) + return warnings @@ -883,118 +946,120 @@ def cross_validate_imports_vs_calls( # Validation Functions # ============================================================================ -def extract_top_level_definitions(code: str) -> Tuple[List[str], List[str]]: - """Extract top-level function and class names from code.""" - functions = [] - classes = [] - try: - tree = ast.parse(code) - for node in ast.iter_child_nodes(tree): - if isinstance(node, ast.FunctionDef): - functions.append(node.name) - elif isinstance(node, ast.AsyncFunctionDef): - functions.append(node.name) - elif isinstance(node, ast.ClassDef): - classes.append(node.name) - except SyntaxError: - pass - return functions, classes - - -def check_has_docstring(code: str) -> Tuple[bool, str]: - """Check if top-level functions/classes have docstrings.""" - errors = [] - try: - tree = ast.parse(code) - for node in ast.iter_child_nodes(tree): - if isinstance(node,
(ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - if not ast.get_docstring(node): - errors.append(f"{type(node).__name__} '{node.name}' is missing a docstring") - except SyntaxError: - pass - - if errors: - return False, "; ".join(errors) - return True, "" +def _unit_has_docstring(unit: Any) -> bool: + """Return whether a parsed Python unit has a docstring.""" + docstring = getattr(unit, "docstring", None) + if docstring: + return True + node = (getattr(unit, "extra", {}) or {}).get("ast_node") + if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)): + return bool(ast.get_docstring(node)) + return False + + +def _strip_markdown_code_fence(code: str) -> str: + """Remove a full Markdown code fence around an interface snippet.""" + match = re.fullmatch(r"\s*```[A-Za-z0-9_+-]*\s*\n(.*?)\n```\s*", code, re.DOTALL) + return f"{match.group(1)}\n" if match else code def validate_interface( interface: Dict[str, Any], target_features: Set[str], - covered_features: Set[str] + covered_features: Set[str], + backend: Optional[LanguageBackend] = None, ) -> Tuple[bool, str, Dict[str, Any]]: """Validate a single interface definition using ParsedFile. 
Returns: (is_valid, error_message, parsed_info) """ - features = interface.get("features", []) - code = interface.get("code", "") + backend = backend or get_backend("python") + raw_features = interface.get("features", []) + features = list(raw_features) if isinstance(raw_features, list) else [] + code = _strip_markdown_code_fence(interface.get("code", "")) + interface["code"] = code errors = [] + + if target_features: + invalid_features = sorted(set(features) - target_features) + duplicate_features = sorted(set(features) & covered_features) + filtered_features = [ + feature for feature in features + if feature in target_features and feature not in covered_features + ] + if invalid_features or duplicate_features: + warnings = interface.setdefault("_validation_warnings", []) + if invalid_features: + warnings.append( + "Ignored feature paths outside this file's target set: " + + ", ".join(invalid_features) + ) + if duplicate_features: + warnings.append( + "Ignored feature paths already covered by earlier interfaces: " + + ", ".join(duplicate_features) + ) + features = filtered_features + interface["features"] = features # Check features if not features: - errors.append("Interface must have at least one feature") - else: - feature_set = set(features) - - # Check for overlap with already covered features - overlap = feature_set & covered_features - if overlap: - errors.append(f"Features {list(overlap)} are already covered by another interface") - - # Check if features are in target features - if target_features: - invalid_features = feature_set - target_features - if invalid_features: - errors.append(f"Features {list(invalid_features)} are not in target features") - - # Auto-fix hyphenated module names in import statements - # (e.g., "from blog-system.security import ..." 
-> "from blog_system.security import ...") - code = re.sub( - r'^(\s*(?:from|import)\s+)([\w\-]+(?:\.[\w\-]+)*)', - lambda m: m.group(1) + m.group(2).replace('-', '_'), - code, - flags=re.MULTILINE, - ) - # Persist the fixed code back so downstream consumers get corrected imports - interface["code"] = code + errors.append("Interface must cover at least one uncovered target feature") + + if backend.name == "python": + code = re.sub( + r'^(\s*(?:from|import)\s+)([\w\-]+(?:\.[\w\-]+)*)', + lambda m: m.group(1) + m.group(2).replace('-', '_'), + code, + flags=re.MULTILINE, + ) + interface["code"] = code - # Parse code with ParsedFile - parsed_file = ParsedFile(code=code, file_path="temp_interface.py") - - # Check for syntax errors - if parsed_file.has_error(): - error = parsed_file.error - errors.append(f"Syntax error: line {error.lineno}, column {error.offset}: {error.msg}") + ok, syntax_error = backend.syntax_check(code, f"temp_interface{backend.file_extension}") + if not ok: + errors.append(f"Syntax error: {syntax_error}") return False, "; ".join(errors), {} - - # Extract only class and function units (not methods) + interface_units = [ - unit for unit in parsed_file.units - if unit.unit_type in ["function", "class"] + unit for unit in backend.list_code_units(code, f"temp_interface{backend.file_extension}") + if unit.unit_type in [ + "function", + "class", + "struct", + "interface", + "method", + "type", + "enum", + ] + and (unit.parent is None or unit.unit_type == "method") ] - + if not interface_units: - errors.append("No valid functions/classes found in code") - - # Check docstrings - for unit in interface_units: - if not unit.docstring and unit.unit_type in ["function", "class"]: - errors.append( - f"Missing docstring for {unit.unit_type} '{unit.name}' " - f"in features {features}" - ) + errors.append("No valid target-language declarations found in code") + + if backend.name == "python": + for unit in interface_units: + if not _unit_has_docstring(unit) and 
unit.unit_type in ["function", "class"]: + errors.append( + f"Missing docstring for {unit.unit_type} '{unit.name}' " + f"in features {features}" + ) if errors: return False, "; ".join(errors), {} # Build parsed info with CodeUnit objects - functions = [u.name for u in interface_units if u.unit_type == "function"] - classes = [u.name for u in interface_units if u.unit_type == "class"] + functions = [u.name for u in interface_units if u.unit_type in {"function", "method"}] + classes = [ + u.name for u in interface_units + if u.unit_type in {"class", "struct", "interface", "type", "enum"} + ] + declarations = [f"{u.unit_type} {u.name}" for u in interface_units] return True, "", { "functions": functions, "classes": classes, + "declarations": declarations, "features": features, "units": interface_units # Include CodeUnit objects } @@ -1114,7 +1179,8 @@ def __init__( max_iterations: int = 10, logger: Optional[logging.Logger] = None, trajectory: Optional[Any] = None, - step_id: Optional[int] = None + step_id: Optional[int] = None, + target_language: Optional[str] = None, ): # Create LLMClient with trajectory support if not provided if llm_client is None: @@ -1126,6 +1192,7 @@ def __init__( self.llm.set_trajectory(trajectory, step_id) self.max_iterations = max_iterations self.logger = logger or logging.getLogger(__name__) + self.backend = get_backend(target_language) def design_file_interface( self, @@ -1164,7 +1231,7 @@ def design_file_interface( feature_interface_map = {} # Build system prompt (tool description is now integrated) - system_prompt = INTERFACE_PROMPT + system_prompt = with_language_directive(INTERFACE_PROMPT, self.backend) # Build user prompt features_str = "\n".join([f"- {f}" for f in file_features]) @@ -1177,7 +1244,7 @@ def design_file_interface( {features_str} - When calling `design_itfs_for_feature`, ONLY use feature paths listed above. - Do NOT introduce new/unspecified feature paths. 
-- Define interfaces only (imports + signature + docstring + `pass`). +- Define interfaces only (imports + target-language declaration stubs + target-language documentation). - Prefer one function/class per feature or a small group of closely related features. - Keep each interface focused and with narrow responsibility. - You MAY import and reuse symbols from upstream context and base classes. @@ -1240,12 +1307,17 @@ def design_file_interface( valid_interfaces = [] for interface in interfaces: is_valid, error, info = validate_interface( - interface, target_features, covered_features + interface, + target_features, + covered_features, + backend=self.backend, ) if is_valid: # Add name field from parsed info - if info.get("classes"): + if info.get("declarations"): + interface["name"] = info["declarations"][0] + elif info.get("classes"): interface["name"] = f"class {info['classes'][0]}" elif info.get("functions"): interface["name"] = f"function {info['functions'][0]}" @@ -1380,7 +1452,8 @@ def __init__( max_iterations: int = 10, logger: Optional[logging.Logger] = None, trajectory: Optional[Any] = None, - step_id: Optional[int] = None + step_id: Optional[int] = None, + target_language: Optional[str] = None, ): if llm_client is None: self.llm = LLMClient(trajectory=trajectory, step_id=step_id) @@ -1390,6 +1463,7 @@ def __init__( self.llm.set_trajectory(trajectory, step_id) self.max_iterations = max_iterations self.logger = logger or logging.getLogger(__name__) + self.backend = get_backend(target_language) def design_subtree_interfaces( self, @@ -1442,8 +1516,15 @@ def design_subtree_interfaces( self.logger.warning("[SubtreeInterfaceAgent] No files with features to design") return {} + if self._should_use_c_family_verification_fallback(subtree_name): + for file_path in file_order: + state = file_states.get(file_path) + if state is not None: + self._complete_remaining_c_family_features(file_path, state) + return self._build_subtree_results(file_order, file_states) + # 
Build system prompt (tool description is now integrated) - system_prompt = SUBTREE_INTERFACE_PROMPT + system_prompt = with_language_directive(SUBTREE_INTERFACE_PROMPT, self.backend) last_error = "" @@ -1516,12 +1597,17 @@ def design_subtree_interfaces( for interface in file_block.interfaces: iface_dict = interface.model_dump() is_valid, error, info = validate_interface( - iface_dict, target_features, covered_features + iface_dict, + target_features, + covered_features, + backend=self.backend, ) if is_valid: # Add name from parsed info - if info.get("classes"): + if info.get("declarations"): + iface_dict["name"] = info["declarations"][0] + elif info.get("classes"): iface_dict["name"] = f"class {info['classes'][0]}" elif info.get("functions"): iface_dict["name"] = f"function {info['functions'][0]}" @@ -1574,7 +1660,14 @@ def design_subtree_interfaces( self.logger.error(f"[SubtreeInterfaceAgent] Error: {e}") last_error = str(e) - # Build final results for each file + return self._build_subtree_results(file_order, file_states) + + def _build_subtree_results( + self, + file_order: List[str], + file_states: Dict[str, Dict[str, Any]], + ) -> Dict[str, Dict[str, Any]]: + """Build final subtree results from accumulated file states.""" results: Dict[str, Dict[str, Any]] = {} all_new_features: List[Dict[str, str]] = [] @@ -1583,6 +1676,7 @@ def design_subtree_interfaces( continue state = file_states[file_path] + self._complete_remaining_c_family_features(file_path, state) file_result, new_features = self._build_file_result( file_path=file_path, all_interfaces=state["all_interfaces"], @@ -1599,6 +1693,112 @@ def design_subtree_interfaces( results["__new_features__"] = all_new_features return results + + def _should_use_c_family_verification_fallback(self, subtree_name: str) -> bool: + """Return whether C-family verification interfaces should be deterministic.""" + if self.backend.name not in {"c", "cpp"}: + return False + normalized = subtree_name.casefold() + return 
"verification" in normalized or "test" in normalized + + def _complete_remaining_c_family_features( + self, + file_path: str, + state: Dict[str, Any], + ) -> None: + """Add deterministic C/C++ declarations for uncovered features.""" + if self.backend.name not in {"c", "cpp"}: + return + target_features = state.get("target_features", set()) + covered_features = state.get("covered_features", set()) + remaining_features = sorted(target_features - covered_features) + if not remaining_features: + return + + declaration_code = self._fallback_declaration_code( + file_path=file_path, + features=remaining_features, + ) + interface = { + "features": remaining_features, + "code": declaration_code, + "dependencies": { + "inherits_from": [], + "calls": [], + "uses_types": [], + }, + } + is_valid, error, info = validate_interface( + interface, + target_features, + covered_features, + backend=self.backend, + ) + if not is_valid: + self.logger.warning( + "[SubtreeInterfaceAgent] Deterministic %s completion failed for %s: %s", + self.backend.display_name, + file_path, + error, + ) + return + + if info.get("declarations"): + interface["name"] = info["declarations"][0] + elif info.get("classes"): + interface["name"] = f"class {info['classes'][0]}" + elif info.get("functions"): + interface["name"] = f"function {info['functions'][0]}" + interface["parsed_units"] = info.get("units", []) + + state["all_interfaces"].append(interface) + state["all_code_blocks"].append(declaration_code) + covered_features.update(interface.get("features", [])) + self.logger.info( + "[SubtreeInterfaceAgent] Added deterministic %s interface for %s (%d feature%s)", + self.backend.display_name, + file_path, + len(interface.get("features", [])), + "" if len(interface.get("features", [])) == 1 else "s", + ) + + def _fallback_declaration_code(self, file_path: str, features: List[str]) -> str: + """Return a parseable C-family declaration covering ``features``.""" + function_name = 
self._fallback_function_name(file_path, features) + feature_lines = "\n".join(f" * - {feature}" for feature in features) + if self.backend.name == "c": + return ( + "/**\n" + " * Declares the remaining interface contract for:\n" + f"{feature_lines}\n" + " *\n" + " * Returns:\n" + " * int status code supplied by the implementation.\n" + " */\n" + f"int {function_name}(void);\n" + ) + return ( + "namespace tasklite {\n" + "namespace generated {\n" + "/// Declares the remaining interface contract for:\n" + + "\n".join(f"/// - {feature}" for feature in features) + + "\n" + f"bool {function_name}();\n" + "} // namespace generated\n" + "} // namespace tasklite\n" + ) + + def _fallback_function_name(self, file_path: str, features: List[str]) -> str: + """Build a stable C-family function name from file and feature paths.""" + path_stem = Path(file_path).stem + feature_tail = "_".join(feature.rsplit("/", 1)[-1] for feature in features) + raw_name = f"{path_stem}_{feature_tail}" + cleaned = re.sub(r"[^A-Za-z0-9_]+", "_", raw_name).strip("_").lower() + if not cleaned: + cleaned = "generated_interface" + if cleaned[:1].isdigit(): + cleaned = f"_{cleaned}" + return self.backend.sanitize_module_identifier(cleaned) def _build_subtree_user_prompt( self, @@ -1644,7 +1844,7 @@ def _build_subtree_user_prompt( completed_parts.append( f"File: `{file_path}` (already designed)\n" - f"```python\n{code_preview}\n```" + f"```{self.backend.markdown_fence}\n{code_preview}\n```" ) completed_context = ( @@ -1655,7 +1855,7 @@ def _build_subtree_user_prompt( # Assemble user prompt # Detect import convention from file paths import_convention = "" - if remaining_files: + if remaining_files and self.backend.name == "python": # Infer prefix from file paths in this subtree sample_path = remaining_files[0] parts = sample_path.replace("\\", "/").split("/") @@ -1802,7 +2002,8 @@ def __init__( logger: Optional[logging.Logger] = None, trajectory: Optional[Any] = None, step_id: Optional[int] = None, - 
output_path: Optional[str] = None + output_path: Optional[str] = None, + target_language: Optional[str] = None, ): # Create LLMClient with trajectory support if not provided if llm_client is None: @@ -1818,6 +2019,7 @@ def __init__( self.trajectory = trajectory self.step_id = step_id self.output_path = output_path + self.backend = get_backend(target_language) def design_all_interfaces( self, @@ -1851,6 +2053,11 @@ def design_all_interfaces( self.logger.info(f"[InterfaceOrchestrator] Processing {len(subtree_order)} subtrees") self.logger.info(f"[InterfaceOrchestrator] Subtree order: {subtree_order}") + print( + f"[InterfaceOrchestrator] Subtrees to process: {len(subtree_order)} " + f"({', '.join(subtree_order)})", + flush=True, + ) # Format base classes and data structures together for prompt context base_classes_str = format_base_classes_and_data_structures( @@ -1864,25 +2071,59 @@ def design_all_interfaces( ) # --- Initialize GlobalInterfaceRegistry --- - global_registry = GlobalInterfaceRegistry() + global_registry = GlobalInterfaceRegistry(backend=self.backend) # Track state across subtrees all_interfaces = {} implemented_subtrees = {} # subtree -> list of implemented file info all_import_warnings = [] # collect import cross-validation warnings all_new_features = [] # collect new features created across all subtrees + coverage_status = self._new_coverage_status() + restored_subtrees = self._restore_completed_subtrees( + skeleton=skeleton, + subtree_order=subtree_order, + all_interfaces=all_interfaces, + implemented_subtrees=implemented_subtrees, + coverage_status=coverage_status, + global_registry=global_registry, + ) + if restored_subtrees: + restored_in_order = [name for name in subtree_order if name in restored_subtrees] + print( + f"[InterfaceOrchestrator] Restored completed subtrees: " + f"{len(restored_subtrees)}/{len(subtree_order)} " + f"({', '.join(restored_in_order)})", + flush=True, + ) # Process each subtree - for subtree_name in subtree_order: + for 
subtree_index, subtree_name in enumerate(subtree_order, start=1): + if subtree_name in restored_subtrees: + self.logger.info( + f"[InterfaceOrchestrator] Reusing completed subtree: {subtree_name}" + ) + self._print_coverage_progress( + coverage_status, + len(all_interfaces), + len(subtree_order), + ) + continue self.logger.info(f"[InterfaceOrchestrator] Processing subtree: {subtree_name}") # Find files for this subtree file_nodes = self._find_files_for_subtree(skeleton, subtree_name) if not file_nodes: self.logger.warning(f"No files found for subtree: {subtree_name}") + self._record_missing_subtree(coverage_status, subtree_name) continue self.logger.info(f"[InterfaceOrchestrator] Found {len(file_nodes)} files for {subtree_name}") + print( + f"[InterfaceOrchestrator] Subtree {subtree_index}/{len(subtree_order)}: " + f"{subtree_name} ({len(file_nodes)} files, " + f"{self._subtree_feature_count(file_nodes)} features)", + flush=True, + ) # --- Merge global registry symbols into base_class_files --- # This allows DependencyCollector to resolve cross-subtree callees @@ -1909,44 +2150,21 @@ def design_all_interfaces( agent = SubtreeInterfaceAgent( llm_client=self.llm, max_iterations=self.max_file_iterations, - logger=self.logger + logger=self.logger, + target_language=self.backend.name, ) - # Layer-2 retry: if the agent's internal 10-iteration loop - # leaves any file with no units, give the whole subtree ONE - # second chance. This is the simple variant — attempt 2 - # reruns the entire subtree (not just failed files). The - # cost (extra LLM round) is bounded and only paid when at - # least one file actually failed, which is rare in practice. 
- max_subtree_attempts = 2 - file_results: Dict[str, Any] = {} - for attempt in range(max_subtree_attempts): - file_results = agent.design_subtree_interfaces( - file_nodes=file_nodes, - file_order=file_order, - repo_info=repo_info, - data_flow_str=filtered_data_flow_str, - base_classes_str=base_classes_str, - upstream_context=upstream_context, - dependency_collector=dependency_collector, - base_class_files=base_class_files, - subtree_name=subtree_name, - ) - failed_paths = [ - fp for fp, r in file_results.items() - if fp != "__new_features__" - and isinstance(r, dict) - and not r.get("units") - ] - if not failed_paths: - break - if attempt + 1 < max_subtree_attempts: - self.logger.warning( - f"[InterfaceOrchestrator] Subtree '{subtree_name}' " - f"left {len(failed_paths)} file(s) without units " - f"after attempt {attempt + 1}/{max_subtree_attempts}; " - f"retrying whole subtree once. Failed: {failed_paths[:5]}" - ) + file_results = agent.design_subtree_interfaces( + file_nodes=file_nodes, + file_order=file_order, + repo_info=repo_info, + data_flow_str=filtered_data_flow_str, + base_classes_str=base_classes_str, + upstream_context=upstream_context, + dependency_collector=dependency_collector, + base_class_files=base_class_files, + subtree_name=subtree_name, + ) # Extract new features from this subtree subtree_new_features = file_results.pop("__new_features__", []) @@ -1985,6 +2203,14 @@ def design_all_interfaces( self.logger.info(f"[InterfaceOrchestrator] [OK] Completed {file_path}") else: self.logger.warning(f"[InterfaceOrchestrator] [FAIL] Failed {file_path}") + + for file_node in file_nodes: + self._record_file_coverage( + coverage_status=coverage_status, + subtree_name=subtree_name, + file_node=file_node, + result=file_results.get(file_node.get("path", "")), + ) # --- A1: Register completed subtree interfaces to GlobalInterfaceRegistry --- global_registry.register_from_subtree_result(subtree_name, subtree_interfaces) @@ -2008,7 +2234,8 @@ def 
design_all_interfaces( code=file_code, file_path=file_path, declared_calls=list(declared_calls), - global_registry=global_registry + global_registry=global_registry, + backend=self.backend, ) if warnings: all_import_warnings.extend(warnings) @@ -2026,11 +2253,26 @@ def design_all_interfaces( # Save after each subtree self._save_interfaces( - self._build_result(all_interfaces, subtree_order, implemented_subtrees) + self._build_result( + all_interfaces, + subtree_order, + implemented_subtrees, + coverage_status, + ) + ) + self._print_coverage_progress( + coverage_status, + len(all_interfaces), + len(subtree_order), ) # Compile final result - final_result = self._build_result(all_interfaces, subtree_order, implemented_subtrees) + final_result = self._build_result( + all_interfaces, + subtree_order, + implemented_subtrees, + coverage_status, + ) # Store import warnings and global registry in result for downstream use final_result["_import_warnings"] = all_import_warnings @@ -2051,18 +2293,107 @@ def _build_result( self, all_interfaces: Dict[str, Any], subtree_order: List[str], - implemented_subtrees: Dict[str, List[Dict[str, Any]]] + implemented_subtrees: Dict[str, List[Dict[str, Any]]], + coverage_status: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Build the result dict from current state.""" + coverage = coverage_status or self._new_coverage_status() return { + "meta": { + "primary_language": self.backend.name, + "target_languages": [self.backend.name], + }, "subtrees": all_interfaces, "subtree_order": subtree_order, "implemented_subtrees": { st: [f["path"] for f in files] for st, files in implemented_subtrees.items() }, - "success": True + "coverage": coverage, + "success": not coverage.get("issues"), } + + @staticmethod + def _new_coverage_status() -> Dict[str, Any]: + """Return an empty coverage accumulator for interface generation.""" + return { + "expected_files": 0, + "successful_files": 0, + "expected_features": 0, + "covered_features": 0, + 
"missing_features": 0, + "failed_files": [], + "missing_subtrees": [], + "issues": [], + } + + @staticmethod + def _features_from_file_result(result: Dict[str, Any]) -> Set[str]: + """Extract feature paths mapped by a generated file result.""" + features: Set[str] = set() + for mapped_features in (result.get("units_to_features") or {}).values(): + if isinstance(mapped_features, list): + features.update(str(feature) for feature in mapped_features) + elif isinstance(mapped_features, str): + features.add(mapped_features) + return features + + @staticmethod + def _record_missing_subtree( + coverage_status: Dict[str, Any], + subtree_name: str, + ) -> None: + """Record a subtree referenced by data flow but absent from skeleton.""" + coverage_status["missing_subtrees"].append(subtree_name) + coverage_status["issues"].append({ + "subtree": subtree_name, + "file_path": None, + "reason": "subtree has no skeleton files", + "missing_features": [], + }) + + @classmethod + def _record_file_coverage( + cls, + coverage_status: Dict[str, Any], + subtree_name: str, + file_node: Dict[str, Any], + result: Optional[Dict[str, Any]], + ) -> None: + """Record generated interface coverage for one skeleton file.""" + file_path = file_node.get("path", "") + expected_features = set(file_node.get("feature_paths", [])) + if not expected_features: + return + + coverage_status["expected_files"] += 1 + coverage_status["expected_features"] += len(expected_features) + + produced_features = cls._features_from_file_result(result or {}) + covered_features = expected_features & produced_features + missing_features = sorted(expected_features - produced_features) + has_units = bool(result and result.get("units")) + + coverage_status["covered_features"] += len(covered_features) + coverage_status["missing_features"] += len(missing_features) + + if has_units and not missing_features: + coverage_status["successful_files"] += 1 + return + + reason = "missing features" + if not result: + reason = "no result" + 
elif not has_units: + reason = "no units" + + coverage_status["failed_files"].append(file_path) + coverage_status["issues"].append({ + "subtree": subtree_name, + "file_path": file_path, + "reason": reason, + "missing_features": missing_features, + }) def _save_interfaces(self, result: Dict[str, Any]) -> None: """Save current interfaces result to output_path (if configured). @@ -2085,6 +2416,148 @@ def _save_interfaces(self, result: Dict[str, Any]) -> None: self.logger.info(f"[InterfaceOrchestrator] Saved interfaces to {output}") except Exception as e: self.logger.warning(f"[InterfaceOrchestrator] Failed to save interfaces: {e}") + + @staticmethod + def _subtree_feature_count(file_nodes: List[Dict[str, Any]]) -> int: + """Return the number of distinct feature paths assigned to files.""" + features: Set[str] = set() + for file_node in file_nodes: + features.update(file_node.get("feature_paths", [])) + return len(features) + + @staticmethod + def _print_coverage_progress( + coverage_status: Dict[str, Any], + processed_subtrees: int, + total_subtrees: int, + ) -> None: + """Print compact progress for long-running interface generation.""" + expected_features = coverage_status.get("expected_features", 0) + covered_features = coverage_status.get("covered_features", 0) + issue_count = len(coverage_status.get("issues", []) or []) + print( + f"[InterfaceOrchestrator] Progress: {processed_subtrees}/{total_subtrees} " + f"subtrees, {covered_features}/{expected_features} processed features " + f"covered, issues={issue_count}", + flush=True, + ) + + def _load_existing_interfaces(self) -> Optional[Dict[str, Any]]: + """Load an existing interfaces file for subtree-level resume.""" + if not self.output_path: + return None + path = Path(self.output_path) + if not path.exists(): + return None + try: + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + except Exception as exc: + self.logger.warning( + f"[InterfaceOrchestrator] Failed to load existing 
interfaces: {exc}" + ) + return None + return data if isinstance(data, dict) else None + + def _restore_completed_subtrees( + self, + skeleton: Dict[str, Any], + subtree_order: List[str], + all_interfaces: Dict[str, Any], + implemented_subtrees: Dict[str, List[Dict[str, Any]]], + coverage_status: Dict[str, Any], + global_registry: "GlobalInterfaceRegistry", + ) -> Set[str]: + """Restore a contiguous prefix of complete subtrees from output_path.""" + existing = self._load_existing_interfaces() + if not existing: + return set() + + restored: Set[str] = set() + existing_subtrees = existing.get("subtrees") or {} + if not isinstance(existing_subtrees, dict): + return restored + + for subtree_name in subtree_order: + subtree_data = existing_subtrees.get(subtree_name) + if not isinstance(subtree_data, dict): + break + + file_nodes = self._find_files_for_subtree(skeleton, subtree_name) + file_container = subtree_data.get( + "interfaces", + subtree_data.get("files", {}), + ) + if not isinstance(file_container, dict): + break + if not self._subtree_interfaces_complete(file_nodes, file_container): + break + + all_interfaces[subtree_name] = { + "files_order": subtree_data.get("files_order") + or [node.get("path", "") for node in file_nodes], + "interfaces": file_container, + } + implemented_subtrees[subtree_name] = self._implemented_files_from_existing( + file_nodes, + file_container, + ) + for file_node in file_nodes: + self._record_file_coverage( + coverage_status=coverage_status, + subtree_name=subtree_name, + file_node=file_node, + result=file_container.get(file_node.get("path", "")), + ) + global_registry.register_from_subtree_result(subtree_name, file_container) + restored.add(subtree_name) + + if restored: + self.logger.info( + f"[InterfaceOrchestrator] Restored {len(restored)} completed subtree(s): " + f"{sorted(restored)}" + ) + return restored + + @classmethod + def _subtree_interfaces_complete( + cls, + file_nodes: List[Dict[str, Any]], + file_container: Dict[str, 
Any], + ) -> bool: + """Return True when existing subtree interfaces cover all features.""" + expected_features: Set[str] = set() + for file_node in file_nodes: + expected_features.update(file_node.get("feature_paths", [])) + if not expected_features: + return False + + produced_features: Set[str] = set() + for result in file_container.values(): + if isinstance(result, dict): + produced_features.update(cls._features_from_file_result(result)) + return expected_features <= produced_features + + @staticmethod + def _implemented_files_from_existing( + file_nodes: List[Dict[str, Any]], + file_container: Dict[str, Any], + ) -> List[Dict[str, Any]]: + """Build implemented_subtrees entries from restored interface data.""" + implemented: List[Dict[str, Any]] = [] + for file_node in file_nodes: + file_path = file_node.get("path", "") + result = file_container.get(file_path) + if not isinstance(result, dict) or not result.get("units"): + continue + implemented.append({ + "path": file_path, + "features": file_node.get("feature_paths", []), + "code": result.get("file_code", ""), + "units": result.get("units", []), + "units_to_features": result.get("units_to_features", {}), + }) + return implemented def _build_base_class_files_mapping( self, @@ -2111,17 +2584,16 @@ def _build_base_class_files_mapping( if not file_path or not code: continue - # Parse code to extract class and type names - try: - tree = ast.parse(code) - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - mapping[node.name] = file_path - elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): - # Top-level functions might be utilities - mapping[node.name] = file_path - except SyntaxError: - continue + # Parse code through the target-language backend so declaration + # discovery is shared with other interface-analysis paths. + # Syntax errors yield an empty unit list. 
+ for unit in self.backend.list_code_units(code, file_path): + if unit.unit_type == "class": + mapping[unit.name] = file_path + elif unit.unit_type in ("function", "method"): + # Map every function-like name so nested callable + # declarations can still satisfy dependency lookups. + mapping[unit.name] = file_path # Process data structures (only those with file_path already assigned) if data_structures: @@ -2132,13 +2604,12 @@ def _build_base_class_files_mapping( if not file_path or not code: continue - try: - tree = ast.parse(code) - for node in ast.walk(tree): - if isinstance(node, ast.ClassDef): - mapping[node.name] = file_path - except SyntaxError: - continue + # Parse through the target-language backend to share class + # discovery with interface dependency analysis. Syntax + # errors yield an empty unit list. + for unit in self.backend.list_code_units(code, file_path): + if unit.unit_type == "class": + mapping[unit.name] = file_path # Also map data_flow_types names to file paths for dt_name in ds.get("data_flow_types", []): diff --git a/CoderMind/scripts/func_design/interface_prompts.py b/CoderMind/scripts/func_design/interface_prompts.py index a057002..a9f7e0d 100644 --- a/CoderMind/scripts/func_design/interface_prompts.py +++ b/CoderMind/scripts/func_design/interface_prompts.py @@ -9,27 +9,27 @@ # ============================================================================ INTERFACE_PROMPT = """ -You are designing interfaces (functions or classes) for a large, production-oriented Python repository. +You are designing interfaces (functions, classes, structs, interfaces, or methods) for a large, production-oriented target-language repository. The goal is not to write arbitrary APIs, but to define interfaces that integrate cleanly into the repository's architecture, respect existing data flows, and follow established conventions for modules, base classes, and shared data structures. ## Objective For each invocation: 1. 
Select exactly one assigned feature, or a small group of closely related features. -2. Define exactly one public interface for it (either a function or a class). +2. Define exactly one public target-language interface for it. 3. Provide the following elements: - All required imports: - standard library imports - external dependency imports - internal project imports - The interface definition: - - function or class signature only - - no implementation logic (function and method bodies must contain only `pass`) - - A precise docstring documenting: + - target-language declaration stubs only + - no implementation logic; function and method bodies must use a parseable target-language placeholder + - Precise target-language documentation comments or docstrings documenting: - purpose and intended usage context within the repository - parameters, including names, types, and semantics - return type and meaning - assumptions, constraints, error conditions, and edge cases -4. Do not generate placeholder logic or pseudo-implementation. Only define signatures and `pass`. +4. Do not generate implementation logic or pseudo-implementation. 5. Interface design is incremental. Each round may define one or a small number of interfaces, but each must be self-contained and justified. ## Repository Context and Constraints @@ -41,15 +41,15 @@ 5. Avoid speculative abstractions that are unrelated to the repository's direction. Interfaces should feel like natural extensions of the repository, not isolated standalone utilities. -## Function vs Class Decision Rules -A function is appropriate when: +## Interface Shape Decision Rules +A function or free operation is appropriate when: - the operation is conceptually a single computation or transformation, - the logic is stateless, - configuration is provided entirely by parameters, - the operation does not manage lifecycle or persistent state. 
- Helper functions are permitted, but only when they clearly support higher-level components rather than replacing them. -A class is appropriate when: +A class, struct, interface, trait, type, or receiver-backed method set is appropriate when: - configuration persists across multiple calls, - internal state influences behavior, - multiple related operations belong together, @@ -113,7 +113,7 @@ "fully/qualified/feature/path_1", "fully/qualified/feature/path_2" ], - "code": "Python code string with imports, class/function signature, docstring, and pass body", + "code": "Target-language code string with imports and declaration stubs", "dependencies": {{ "inherits_from": ["BaseClassName"], "calls": ["function_or_method_name"], @@ -126,9 +126,9 @@ Constraints: - One interface per code string, covering one feature or a tight group of related features. -- The code must define either one top-level function OR one top-level class (with zero or more methods). -- All function/method bodies must use `pass`. -- Public functions and classes must have docstrings. +- The code must define exactly one cohesive target-language declaration group. +- Function and method bodies must use a parseable target-language placeholder and contain no implementation logic. +- Public declarations must have target-language documentation comments or docstrings. - Prefer explicit, custom containers and typed structures; do not use pandas.DataFrame or other third-party tabular types. """.strip() @@ -138,7 +138,7 @@ # ============================================================================ SUBTREE_INTERFACE_PROMPT = """ -You are designing interfaces (functions or classes) for a large, production-oriented Python repository. +You are designing interfaces (functions, classes, structs, interfaces, or methods) for a large, production-oriented target-language repository. 
The goal is to define interfaces that integrate cleanly into the repository's architecture, respect existing data flows, and follow established conventions. ## Objective @@ -151,15 +151,15 @@ 2. Each interface covers one feature or a small group of closely related features. 3. For each interface, provide: - Required imports (standard library, external, internal project) - - The interface definition: function or class signature only, with `pass` bodies (no implementation logic) - - A docstring covering: purpose, parameters with types and semantics, return type, and notable constraints or edge cases + - The interface definition: target-language declaration stubs with no implementation logic + - Target-language documentation comments or docstrings covering: purpose, parameters with types and semantics, return type, and notable constraints or edge cases 4. You MAY import and reuse symbols from upstream context, base classes, and earlier files in this batch. 5. **Glue/Orchestration Code**: If you need to create orchestrator classes, manager facades, or data structures that integrate multiple features but don't map to any assigned feature, you MAY create NEW feature paths for them. Simply include these new feature paths in the `features` field. New feature paths should follow the same naming convention as existing ones (e.g., "Subtree Name/category/feature name"). ## Design Guidelines -### Function vs Class -Use a **function** for stateless, single-operation computations where all configuration is provided by parameters. -Use a **class** when state persists across calls, multiple related operations belong together, or subclassing/pluggable behavior is expected. +### Interface Shape +Use a free function for stateless, single-operation computations where all configuration is provided by parameters. 
+Use a class, struct, interface, trait, type, or receiver-backed method set when state persists across calls, multiple related operations belong together, or pluggable behavior is expected. ### Cohesion and Grouping - Each interface must correspond to a single coherent responsibility. @@ -214,7 +214,7 @@ "interfaces": [ {{ "features": ["fully/qualified/feature/path_1", "fully/qualified/feature/path_2"], - "code": "Python code string with imports, class/function signature, docstring, and pass body", + "code": "Target-language code string with imports and declaration stubs", "dependencies": {{ "inherits_from": ["BaseClassName"], "calls": ["function_or_method_name"], @@ -229,9 +229,9 @@ Constraints: - file_path must match exactly one of the file paths specified in the task. -- One interface per code string: either one top-level function OR one top-level class. -- All function/method bodies must use `pass`. -- Public functions and classes must have docstrings. +- One interface per code string: exactly one cohesive target-language declaration group. +- Function and method bodies must use a parseable target-language placeholder and contain no implementation logic. +- Public declarations must have target-language documentation comments or docstrings. - For most interfaces, use the assigned feature paths from the task. - For glue/orchestration code that doesn't map to any assigned feature, you may create NEW feature paths following the naming convention: "Subtree Name/category/feature name". """.strip() @@ -242,7 +242,7 @@ # ============================================================================ PLAN_FILE_PROMPT = """ -You are an expert software architect assisting in planning feature implementation within a Python codebase. +You are an expert software architect assisting in planning feature implementation within a target-language codebase. 
Your task is to construct an **implementation dependency graph** across a set of files that collectively realize a functional subtree of the system. Each file corresponds to one or more feature paths. These features may have logical dependencies derived from the feature hierarchy and standard software layering principles. diff --git a/CoderMind/scripts/func_design/interface_review.py b/CoderMind/scripts/func_design/interface_review.py index 33038f1..e23d656 100644 --- a/CoderMind/scripts/func_design/interface_review.py +++ b/CoderMind/scripts/func_design/interface_review.py @@ -13,10 +13,9 @@ import json import logging -import ast -from collections import defaultdict, deque +from collections import defaultdict from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Any, Set +from typing import Callable, Dict, List, Optional, Tuple, Any, Set import sys from pathlib import Path @@ -24,22 +23,74 @@ from common import LLMClient +# AST inspection routes through the Python backend's +# ``find_main_block_lineno`` helper so entry-point splicing shares the +# same parser abstraction as the rest of interface design. +from decoder_lang import get_backend +from decoder_lang.prompt_directive import with_language_directive + from .interface_agent import ( GlobalInterfaceRegistry, DependencyCollector, - cross_validate_imports_vs_calls, ) from .interface_prompts import ORPHAN_REVIEW_PROMPT logger = logging.getLogger(__name__) +# ============================================================================ +# Non-production feature classification +# ============================================================================ + +# Top-level feature categories whose units are driven by an EXTERNAL runner +# rather than by repository code: test functions are discovered and invoked +# by the test runner, build targets by ``make`` / ``cmake``. 
Such a unit +# legitimately has no incoming *invocation* edge in the production call +# graph, so the "no incoming edge => dead code" orphan heuristic is a false +# positive for it — exactly like the type-like case handled by ``is_callable``, +# but along an orthogonal axis (the unit IS callable, it is just called from +# outside the graph). +# +# Categories are matched on the leading segment of a feature path +# (``"Testing/error reporting/..."`` -> ``"testing"``) and on the subtree +# name, both lower-cased. The set is language-agnostic: the planner emits +# these category names independently of the target language. +NON_PRODUCTION_FEATURE_CATEGORIES: frozenset[str] = frozenset({ + "testing", "test", "tests", "test infrastructure", "test suite", + "build system", "build", "build configuration", + "tooling", "ci", "cd", "ci/cd", +}) + + +def _is_non_production_feature( + features: Optional[List[str]], + subtree: str = "", +) -> bool: + """Return True when a feature-bearing unit belongs to a test/build category. + + Such units are invoked by an external driver (test runner, ``make``), + not by repository code, so a missing incoming invocation edge is not a + coverage gap. Matching is on the leading segment of each feature path + and on the subtree name, both lower-cased, against + :data:`NON_PRODUCTION_FEATURE_CATEGORIES`. Language-agnostic. 
+ """ + if subtree and subtree.strip().lower() in NON_PRODUCTION_FEATURE_CATEGORIES: + return True + for feature in features or (): + if not isinstance(feature, str): + continue + head = feature.split("/", 1)[0].strip().lower() + if head in NON_PRODUCTION_FEATURE_CATEGORIES: + return True + return False + + # ============================================================================ # Global Review Prompt # ============================================================================ GLOBAL_INTERFACE_REVIEW_PROMPT = """ -You are a senior software engineer reviewing the COMPLETE set of interfaces for an entire Python repository. +You are a senior software engineer reviewing the COMPLETE set of interfaces for an entire repository. All subtrees have been designed. Your task is to review the interfaces holistically, focusing on CROSS-MODULE integration — not individual interface quality. @@ -64,8 +115,8 @@ This explicitly includes (do not omit these): * HTTP route handlers (Flask `@route` / FastAPI / Django view functions) — even if not decorated in the signature, any unit whose role is "respond - to an HTTP request" is invoked by the web framework, not by other Python - code. Always mark these as entry_points. + to an HTTP request" is invoked by the web framework, not by other code + in the project. Always mark these as entry_points. * CLI subcommand handlers (click commands, argparse callbacks). * Event / signal subscribers, background workers, scheduled job entry functions, message-queue consumers. @@ -178,9 +229,10 @@ immediately be reported as orphan). Rules for `modify_interface`: -- This action has no auto-handler. Use it sparingly and only for true - architectural issues (e.g. breaking a circular import). Each such request - will be recorded as `unapplied_fixes` and will block `passed=true`. +- This action has no auto-handler: it is recorded as an advisory + `unapplied_fix` for manual follow-up and does NOT block `passed`. 
Use it + sparingly and only for true architectural issues (e.g. breaking a + circular import). - Do NOT use modify_interface for cases solvable by add_dependency or add_interface. @@ -273,9 +325,6 @@ def build_call_graph( for edge in enhanced_data_flow.get("inheritance_edges", []): child = edge.get("child", "") parent = edge.get("parent", "") - source_file = edge.get("source_file", "") - parent_file = edge.get("parent_file", "") - child_candidates = name_to_keys.get(child, []) parent_candidates = name_to_keys.get(parent, []) @@ -289,8 +338,6 @@ def build_call_graph( for edge in enhanced_data_flow.get("reference_edges", []): unit = edge.get("unit", "") ref_type = edge.get("referenced_type", "") - source_file = edge.get("source_file", "") - unit_candidates = name_to_keys.get(unit, []) type_candidates = name_to_keys.get(ref_type, []) @@ -303,43 +350,181 @@ def build_call_graph( return dict(outgoing), dict(incoming), unit_to_file +def _build_unit_feature_index( + interfaces_data: Dict[str, Any], +) -> Dict[str, Tuple[List[str], str]]: + """Map ``file_path::unit_name`` -> (feature paths, subtree name). + + Lets the orphan checks consult a unit's feature category without + re-walking the subtree tree per unit. Units absent from the index + (no ``units_to_features`` entry) are treated as production code by the + callers (the conservative default keeps real dead code detectable). 
+ """ + index: Dict[str, Tuple[List[str], str]] = {} + subtrees = interfaces_data.get("subtrees", {}) + for subtree_name, subtree_data in subtrees.items(): + file_interfaces = subtree_data.get("interfaces", subtree_data.get("files", {})) + for file_path, file_data in file_interfaces.items(): + units_to_features = file_data.get("units_to_features", {}) + for unit_name, features in units_to_features.items(): + index[f"{file_path}::{unit_name}"] = ( + features if isinstance(features, list) else [], + subtree_name, + ) + return index + + +def _unit_name_aliases(unit_name: str) -> Set[str]: + """Return comparable aliases for an interface unit name.""" + raw = unit_name.strip() + if not raw: + return set() + + aliases = {raw} + if " " in raw: + aliases.add(raw.split(" ", 1)[1].strip()) + + expanded = set() + for alias in aliases: + if not alias: + continue + expanded.add(alias) + if "." in alias: + expanded.add(alias.rsplit(".", 1)[-1]) + if "::" in alias: + expanded.add(alias.rsplit("::", 1)[-1]) + return {alias for alias in expanded if alias} + + +def _build_entry_point_keys( + entry_points: List[Dict[str, Any]], + unit_to_file: Dict[str, str], +) -> Set[str]: + """Resolve LLM entry-point records to concrete unit keys.""" + alias_to_keys: Dict[str, Set[str]] = defaultdict(set) + for unit_key in unit_to_file: + unit_name = unit_key.split("::", 1)[1] if "::" in unit_key else unit_key + for alias in _unit_name_aliases(unit_name): + alias_to_keys[alias].add(unit_key) + + entry_point_keys: Set[str] = set() + for entry_point in entry_points: + entry_file = str(entry_point.get("file_path") or "").strip() + entry_unit = str(entry_point.get("unit_name") or "").strip() + if entry_file and entry_unit: + exact_key = f"{entry_file}::{entry_unit}" + if exact_key in unit_to_file: + entry_point_keys.add(exact_key) + + for alias in _unit_name_aliases(entry_unit): + candidate_keys = alias_to_keys.get(alias, set()) + if not entry_file and len(candidate_keys) > 1: + continue + for 
unit_key in candidate_keys: + if entry_file and unit_to_file.get(unit_key) != entry_file: + continue + entry_point_keys.add(unit_key) + return entry_point_keys + + +def _is_isolated_orphan( + unit_key: str, + unit_name: str, + file_path: str, + features: List[str], + subtree: str, + incoming: Dict[str, Set[str]], + outgoing: Dict[str, Set[str]], + entry_point_keys: Set[str], + is_callable: Optional[Callable[[str], bool]], + is_test_file: Optional[Callable[[str], bool]], +) -> bool: + """Return True when a unit is a genuinely disconnected production orphan. + + Single source of truth shared by the unit-level connectivity gate and + the feature-coverage gate so the two can never diverge (a past defect + had them disagree, leaving stale orphan counts). A unit is an orphan + only when ALL of the following hold: + + * it is not an entry point; + * it is callable (type-like units are referenced, not invoked, so a + missing incoming *invocation* edge is expected); + * it is production code (test / build units are driven by an external + runner, so a missing incoming edge is not dead code); + * it is completely isolated — no incoming AND no outgoing edge. + + Requiring isolation on BOTH directions (rather than "no incoming") is + what keeps roots / factories / framework callbacks — which have no + static incoming edge but DO call into the graph — from being mistaken + for dead code. The rule is identical for every language; all + language-specific behaviour enters only through the injected + ``is_callable`` / ``is_test_file`` predicates. 
+ """ + if unit_key in entry_point_keys: + return False + if is_callable is not None and not is_callable(unit_name): + return False + if _is_non_production_feature(features, subtree): + return False + if is_test_file is not None and is_test_file(file_path): + return False + has_incoming = bool(incoming.get(unit_key)) + has_outgoing = bool(outgoing.get(unit_key)) + return not has_incoming and not has_outgoing + + def check_call_graph_connectivity( interfaces_data: Dict[str, Any], enhanced_data_flow: Dict[str, Any], - entry_points: List[Dict[str, Any]] + entry_points: List[Dict[str, Any]], + is_callable: Optional[Callable[[str], bool]] = None, + is_test_file: Optional[Callable[[str], bool]] = None, ) -> Dict[str, Any]: """Build a directed graph of all invocation edges and check connectivity. - Identifies orphan units (non-entry-point units with no incoming edges). + Identifies orphan units: *callable* units (functions / methods / + classes) that are completely isolated — no incoming AND no outgoing + edges — and are not entry points. + + ``is_callable`` is a per-language predicate (``backend.is_callable_unit``). + When supplied, type-like units (struct / enum / interface / ...) are + excluded from orphan candidacy: a data structure legitimately has no + incoming *invocation* edge, so flagging it is a false positive. + When ``None`` (legacy callers / tests) every unit is treated as + callable, preserving the previous behaviour. + + ``is_test_file`` is a per-language predicate (``backend.is_test_file``). + When supplied, units in test files are excluded from orphan candidacy + — together with the language-agnostic test/build feature-category + check — because a test or build unit is driven by an external runner, + not by repository code, so its lack of an incoming edge is not dead + code. ``None`` disables the file-level check (the category check still + applies), preserving legacy behaviour. 
+ + Requiring "no outgoing" as well mirrors + :meth:`InterfacesStore.find_orphan_units` so the convergence gate and + the pruning detector share one definition of "orphan". Returns: Dict with keys: orphan_units, total_units, entry_point_count """ outgoing, incoming, unit_to_file = build_call_graph(interfaces_data, enhanced_data_flow) + feature_index = _build_unit_feature_index(interfaces_data) all_units = set(unit_to_file.keys()) - # Build entry point key set - entry_point_keys = set() - for ep in entry_points: - ep_file = ep.get("file_path", "") - ep_unit = ep.get("unit_name", "") - ep_key = f"{ep_file}::{ep_unit}" - if ep_key in all_units: - entry_point_keys.add(ep_key) - else: - # Try fuzzy match - for uk in all_units: - if uk.endswith(f"::{ep_unit}"): - entry_point_keys.add(uk) - break + entry_point_keys = _build_entry_point_keys(entry_points, unit_to_file) - non_entry_units = all_units - entry_point_keys - - # Units with no incoming edges (excluding entry points) + # Orphan = callable + completely isolated (no incoming, no outgoing). 
orphan_units = [] - for unit_key in non_entry_units: - if unit_key not in incoming or len(incoming[unit_key]) == 0: + for unit_key in all_units: + unit_name = unit_key.split("::", 1)[1] if "::" in unit_key else unit_key + features, subtree = feature_index.get(unit_key, ([], "")) + if _is_isolated_orphan( + unit_key, unit_name, unit_to_file.get(unit_key, ""), + features, subtree, incoming, outgoing, entry_point_keys, + is_callable, is_test_file, + ): orphan_units.append({ "unit_key": unit_key, "file_path": unit_to_file.get(unit_key, ""), @@ -355,26 +540,32 @@ def check_call_graph_connectivity( def check_feature_dependency_coverage( interfaces_data: Dict[str, Any], enhanced_data_flow: Dict[str, Any], - entry_points: List[Dict[str, Any]] + entry_points: List[Dict[str, Any]], + is_callable: Optional[Callable[[str], bool]] = None, + is_test_file: Optional[Callable[[str], bool]] = None, ) -> List[Dict[str, Any]]: - """Check that every feature-bearing unit is either an entry point or has at least one incoming dependency edge. - - Returns: list of orphan features (feature paths without incoming edges - and not in entry points) + """Check that feature-bearing units are not isolated from the graph. + + ``is_callable`` is a per-language predicate (``backend.is_callable_unit``). + When supplied, type-like feature-bearing units (struct / enum / ...) + are excluded: a data structure that carries a feature is "used" by + being referenced, not invoked, so a missing incoming *invocation* + edge is not a coverage gap. When ``None`` every unit is checked + (legacy behaviour). + + ``is_test_file`` is a per-language predicate (``backend.is_test_file``). + When supplied, units in test files are excluded — together with the + language-agnostic test/build feature-category check — because a test + or build unit is invoked by an external runner (test framework / + ``make``), not by repository code, so a missing incoming edge is not a + coverage gap. 
``None`` disables the file-level check (the category + check still applies), preserving legacy behaviour. + + Returns: list of orphan features attached to isolated units. """ - _, incoming, unit_to_file = build_call_graph(interfaces_data, enhanced_data_flow) + outgoing, incoming, unit_to_file = build_call_graph(interfaces_data, enhanced_data_flow) - # Build entry point key set - entry_point_keys = set() - for ep in entry_points: - ep_file = ep.get("file_path", "") - ep_unit = ep.get("unit_name", "") - ep_key = f"{ep_file}::{ep_unit}" - entry_point_keys.add(ep_key) - # Also add bare match - for uk in unit_to_file: - if uk.endswith(f"::{ep_unit}"): - entry_point_keys.add(uk) + entry_point_keys = _build_entry_point_keys(entry_points, unit_to_file) orphan_features = [] subtrees = interfaces_data.get("subtrees", {}) @@ -385,13 +576,11 @@ def check_feature_dependency_coverage( units_to_features = file_data.get("units_to_features", {}) for unit_name, features in units_to_features.items(): unit_key = f"{file_path}::{unit_name}" - - # Skip entry points - if unit_key in entry_point_keys: - continue - - # Check if has any incoming edge - if unit_key not in incoming or len(incoming[unit_key]) == 0: + if _is_isolated_orphan( + unit_key, unit_name, file_path, features, subtree_name, + incoming, outgoing, entry_point_keys, + is_callable, is_test_file, + ): orphan_features.append({ "file_path": file_path, "unit_name": unit_name, @@ -417,7 +606,11 @@ def check_feature_dependency_coverage( def _insert_unit_into_file_code(file_code: str, stub: str) -> str: - """Insert ``stub`` into ``file_code`` at a safe location. + """Insert a Python ``stub`` into ``file_code`` at a safe location. + + Only reached for Python projects: ``_apply_fixes`` skips the + ``add_interface`` action (the sole caller) for non-Python backends, + because stub synthesis emits Python ``def``/``class`` syntax. 
Preferred insertion point is **immediately before** any top-level ``if __name__ == "__main__":`` block, so handler-added units do not @@ -434,27 +627,18 @@ def _insert_unit_into_file_code(file_code: str, stub: str) -> str: if not file_code.strip(): return stub - try: - tree = ast.parse(file_code) - except SyntaxError: - return file_code.rstrip() + "\n\n\n" + stub - - main_node: Optional[ast.If] = None - for node in tree.body: - if ( - isinstance(node, ast.If) - and isinstance(node.test, ast.Compare) - and isinstance(node.test.left, ast.Name) - and node.test.left.id == "__name__" - ): - main_node = node - break + # Locate the module's main guard via the Python backend so a handler- + # added unit is spliced before it. A missing guard (or a parse error) + # falls through to the append branch below. + backend = get_backend("python") + find_main = getattr(backend, "find_main_block_lineno", None) + main_lineno = find_main(file_code) if find_main is not None else None - if main_node is None: + if main_lineno is None: return file_code.rstrip() + "\n\n\n" + stub lines = file_code.splitlines() - insert_at = max(main_node.lineno - 1, 0) # ast.lineno is 1-based + insert_at = max(main_lineno - 1, 0) # ast.lineno is 1-based prefix = lines[:insert_at] suffix = lines[insert_at:] # Ensure separation: one blank line before stub, two blank lines after. @@ -479,11 +663,17 @@ def __init__( llm_client: Optional[LLMClient] = None, trajectory: Optional[Any] = None, step_id: Optional[int] = None, + target_language: Optional[str] = None, ): if llm_client is None: self.llm = LLMClient(trajectory=trajectory, step_id=step_id) else: self.llm = llm_client + # Target-language backend. Structural checks and dependency-edge + # fixes are language-agnostic; only interface-stub synthesis + # (``add_interface``) is Python-specific and is skipped for other + # languages. Defaults to Python so standalone callers are unaffected. 
+ self.backend = get_backend(target_language or "python") self.logger = logging.getLogger(__name__) def review_and_fix( @@ -560,10 +750,14 @@ def review_and_fix( # Step 2: Code-based structural checks connectivity = check_call_graph_connectivity( - interfaces_data, enhanced_data_flow, entry_points + interfaces_data, enhanced_data_flow, entry_points, + is_callable=self.backend.is_callable_unit, + is_test_file=self.backend.is_test_file, ) feature_orphans = check_feature_dependency_coverage( - interfaces_data, enhanced_data_flow, entry_points + interfaces_data, enhanced_data_flow, entry_points, + is_callable=self.backend.is_callable_unit, + is_test_file=self.backend.is_test_file, ) self.logger.info( @@ -634,10 +828,14 @@ def review_and_fix( final_feature_orphans: List[Any] = [] if review_history: final_connectivity = check_call_graph_connectivity( - interfaces_data, enhanced_data_flow, final_entry_points + interfaces_data, enhanced_data_flow, final_entry_points, + is_callable=self.backend.is_callable_unit, + is_test_file=self.backend.is_test_file, ) final_feature_orphans = check_feature_dependency_coverage( - interfaces_data, enhanced_data_flow, final_entry_points + interfaces_data, enhanced_data_flow, final_entry_points, + is_callable=self.backend.is_callable_unit, + is_test_file=self.backend.is_test_file, ) final_orphan_units = final_connectivity["orphan_units"] self.logger.info( @@ -646,16 +844,26 @@ def review_and_fix( f"{len(final_feature_orphans)} orphan feature(s)" ) - # Collect unapplied fixes from every iteration (modify_interface / - # add_interface requests that have no auto-handler). These block - # passed=true because they represent acknowledged-but-unresolved - # architectural issues. + # Collect unapplied fixes from every iteration and classify them: + # * advisory — ``modify_interface`` requests, which by design have + # no auto-handler. They are architectural suggestions for manual + # follow-up and do NOT gate the verdict. 
+ # * blocking — an ``add_dependency`` with unresolved callees or a + # rejected ``add_interface``: wiring the pipeline could not + # install. These gate ``passed``. unapplied_fixes: List[Dict[str, Any]] = [] for entry in review_history: stats = entry.get("fix_stats") or {} for u in stats.get("unapplied", []): unapplied_fixes.append({**u, "iteration": entry.get("iteration")}) + advisory_fixes = [ + u for u in unapplied_fixes if u.get("action") == "modify_interface" + ] + blocking_unapplied_fixes = [ + u for u in unapplied_fixes if u.get("action") != "modify_interface" + ] + last_llm_pass = ( review_history[-1]["llm_review"].get("pass", False) if review_history else False @@ -663,7 +871,7 @@ def review_and_fix( code_passed = ( len(final_orphan_units) == 0 and len(final_feature_orphans) == 0 - and len(unapplied_fixes) == 0 + and len(blocking_unapplied_fixes) == 0 ) final_result = { @@ -672,6 +880,8 @@ def review_and_fix( "final_feature_orphans": final_feature_orphans, "final_orphan_units": final_orphan_units, "unapplied_fixes": unapplied_fixes, + "advisory_fixes": advisory_fixes, + "blocking_unapplied_fixes": blocking_unapplied_fixes, "iterations_run": len(review_history), # ``last_llm_pass`` is a snapshot taken BEFORE the LLM's own # iteration-N fixes are applied, so it can read FAIL even when @@ -788,7 +998,26 @@ def _run_llm_review( Please perform the review tasks and return the JSON result. """.strip() - combined_prompt = f"{GLOBAL_INTERFACE_REVIEW_PROMPT}\n\n{user_prompt}" + # Non-Python projects cannot use the add_interface auto-handler + # (stub synthesis is Python-only). Steer the LLM toward the + # language-agnostic actions so it does not waste a fix slot on a + # request that will be skipped. + language_note = "" + if self.backend.name != "python": + hints = self.backend.prompt_hints() + language_note = ( + f"\n\n## Target Language: {hints.display_name}\n" + f"This is a {hints.display_name} project, NOT Python. 
When " + "recommending fixes, use `add_dependency` (wire an existing " + "callee) or `modify_interface` (describe a manual change). " + "Do NOT use `add_interface`: automatic interface-stub " + "synthesis is only available for Python and will be skipped." + ) + + combined_prompt = ( + with_language_directive(GLOBAL_INTERFACE_REVIEW_PROMPT, self.backend) + + f"{language_note}\n\n{user_prompt}" + ) try: response = self.llm.generate( @@ -941,6 +1170,19 @@ def _apply_fixes( # else: empty calls_to_add — silently ignore (LLM bug, not actionable) elif action == "add_interface": + # Interface-stub synthesis is Python-only: it emits a + # ``def/class`` body with a docstring + ``pass``. For other + # languages we cannot materialise a syntactically valid stub, + # so skip the request without counting it as an unresolved + # blocker (the review still passes on structural grounds). + if self.backend.name != "python": + self.logger.info( + "[InterfaceReviewer] Skipping add_interface for " + "%s project (stub synthesis is Python-only): " + "%s::%s", + self.backend.name, file_path, unit_name, + ) + continue ok, reason, edges_added = self._apply_add_interface( fix=fix, interfaces_data=interfaces_data, diff --git a/CoderMind/scripts/func_design/interfaces_store.py b/CoderMind/scripts/func_design/interfaces_store.py index 2310cf5..5deb2ae 100644 --- a/CoderMind/scripts/func_design/interfaces_store.py +++ b/CoderMind/scripts/func_design/interfaces_store.py @@ -17,6 +17,8 @@ from pathlib import Path from typing import Dict, List, Optional, Set, Any, Union, Tuple +from decoder_lang.unit_kind import classify_unit_kind + logger = logging.getLogger(__name__) @@ -574,6 +576,12 @@ def find_orphan_units(self) -> List[str]: continue if unit.handler_added: continue # protected: handler-added is treated as required + # Type-like units (struct / enum / interface / ...) are + # referenced, not invoked, so a missing edge is not "dead + # code". 
Excluding them keeps the pruning detector aligned + # with the convergence gate (check_call_graph_connectivity). + if classify_unit_kind(unit.name) != "callable": + continue has_outgoing = key in outgoing and len(outgoing[key]) > 0 has_incoming = key in incoming and len(incoming[key]) > 0 if not has_outgoing and not has_incoming: diff --git a/CoderMind/scripts/init_codebase.py b/CoderMind/scripts/init_codebase.py index f01be28..e11a2f6 100644 --- a/CoderMind/scripts/init_codebase.py +++ b/CoderMind/scripts/init_codebase.py @@ -183,6 +183,14 @@ .claude """ +# Dev-env-only subset of the CoderMind block. Appended when a pre-existing +# ``.gitignore`` already carries ``.cmind/`` (so the full block is skipped) +# but predates the throwaway-venv rules. +_GITIGNORE_DEV_ENV_BLOCK = """# CoderMind dev environments (created by codegen pipeline) +.venv_dev/ +.cmind_dev_env/ +""" + # Kept for backward compatibility with any external import — equivalent to # the full ``.gitignore`` written for a brand-new project. GITIGNORE_CONTENT = _GITIGNORE_PYTHON_BLOCK + "\n" + _GITIGNORE_CMIND_BLOCK @@ -209,17 +217,32 @@ def _gitignore_has_cmind_block(existing: str) -> bool: return False +def _gitignore_has_dev_env(existing: str) -> bool: + """Heuristic: does an existing .gitignore already ignore ``.venv_dev/``? + + The codegen pipeline materializes a throwaway ``.venv_dev/`` virtual + environment inside each project. A fixture- or hand-authored + ``.gitignore`` can ship ``.cmind/`` without these dev-env rules, so we + detect them independently to avoid committing scratch venvs. 
+ """ + for raw in existing.splitlines(): + line = raw.strip() + if not line or line.startswith("#"): + continue + if line in (".venv_dev", ".venv_dev/", "/.venv_dev", "/.venv_dev/"): + return True + return False + + # ============================================================================ # Agent Detection & Persistent Instructions # ============================================================================ # -# Removed: the -# previously-generated `repo/.claude/rules/cmind-codegen.md` and -# `repo/.github/instructions/cmind-codegen.instructions.md` files were -# auto-loaded by Claude Code / Copilot for **every** session, contaminating -# unrelated commands (rpg_edit, encode, plain Q&A) with codegen-only -# instructions. The recovery-after-/compact concern is already handled by -# `templates/commands/code_gen.md` itself, which the user re-invokes via +# Do not write persistent codegen instructions into the user's repository. +# Claude Code / Copilot auto-load those files for every session, which would +# contaminate unrelated commands (rpg_edit, encode, plain Q&A) with +# codegen-only instructions. The recovery-after-/compact concern is handled +# by `templates/commands/code_gen.md` itself, which the user re-invokes via # `/cmind.code_gen`. # # `cmind update` cleans up any stale `cmind-codegen.*` files left in older @@ -321,8 +344,9 @@ def create_gitignore(repo_path: Path, dry_run: bool = False) -> bool: has_python = _gitignore_has_python_block(existing) has_cmind = _gitignore_has_cmind_block(existing) + has_dev_env = _gitignore_has_dev_env(existing) - if has_python and has_cmind: + if has_python and has_cmind and has_dev_env: return False # Already fully configured additions = "" @@ -333,6 +357,14 @@ def create_gitignore(repo_path: Path, dry_run: bool = False) -> bool: if additions: additions += "\n" additions += _GITIGNORE_CMIND_BLOCK + elif not has_dev_env: + # The CoderMind block is present but predates the dev-env rules + # (e.g. 
a fixture-shipped .gitignore that only carried ``.cmind/``). + # Append just the dev-env venv ignores so codegen scratch venvs are + # never committed. + if additions: + additions += "\n" + additions += _GITIGNORE_DEV_ENV_BLOCK if not additions: return False diff --git a/CoderMind/scripts/lang_parser/__init__.py b/CoderMind/scripts/lang_parser/__init__.py new file mode 100644 index 0000000..213a609 --- /dev/null +++ b/CoderMind/scripts/lang_parser/__init__.py @@ -0,0 +1,35 @@ +from .base import BaseLanguageParser +from .models import LanguageConfig, LPCodeUnit, LPDependency, LPFileResult, NotSupported +from .registry import ( + detect_language, + dominant_language, + get_config, + get_config_for_path, + get_parser, + get_parser_for_file, + is_supported_source, + is_test_file, + markdown_fence_for_path, + parse_file, + validate_syntax, +) + +__all__ = [ + "BaseLanguageParser", + "LanguageConfig", + "LPCodeUnit", + "LPDependency", + "LPFileResult", + "NotSupported", + "detect_language", + "dominant_language", + "get_config", + "get_config_for_path", + "get_parser", + "get_parser_for_file", + "is_supported_source", + "is_test_file", + "markdown_fence_for_path", + "parse_file", + "validate_syntax", +] diff --git a/CoderMind/scripts/lang_parser/_c_family_parser.py b/CoderMind/scripts/lang_parser/_c_family_parser.py new file mode 100644 index 0000000..9bce62b --- /dev/null +++ b/CoderMind/scripts/lang_parser/_c_family_parser.py @@ -0,0 +1,479 @@ +from __future__ import annotations + +import re + +from .base import BaseLanguageParser +from .extractors.fallback import ( + block_end_for_braces, + delimiter_syntax_error, + dependency_from_import, + line_end_for_statement, + make_unit, + strip_string_literals, +) +from .models import LanguageConfig, LPCodeUnit, LPDependency, LPFileResult +from .tree_sitter_backend import TreeSitterBackend + + +_IDENTIFIER = r"[A-Za-z_]\w*" +_INCLUDE_QUOTE_RE = re.compile(r'^\s*#\s*include\s+"(?P[^"]+)"') +_INCLUDE_ANGLE_RE = 
re.compile(r"^\s*#\s*include\s+<(?P[^>]+)>") +_CLASS_LIKE_RE = re.compile( + rf"^\s*(?:template\s*<[^>]+>\s*)?(?:typedef\s+)?(?Pclass|struct)\s+" + rf"(?P{_IDENTIFIER})\b[^;{{}}]*\{{" +) +_METHOD_RE = re.compile( + rf"^\s*(?:(?:virtual|static|inline|constexpr|consteval|explicit|friend)\s+)*" + rf"(?:(?:[\w:<>~*&,.\[\]\s]+)\s+)?" + rf"(?P~?{_IDENTIFIER})\s*\([^;{{}}]*\)\s*" + rf"(?:const\s*)?(?:noexcept(?:\s*\([^)]*\))?\s*)?(?:override\s*)?(?:final\s*)?" + rf"(?:=\s*(?:0|default|delete)\s*)?(?::[^{{;]+)?\s*(?:\{{|;)" +) +_FUNCTION_DEF_RE = re.compile( + rf"^\s*(?:(?:extern\s+\"C\"\s+)?(?P(?:[\w:<>~*&,.\[\]\s]+)\s+))?" + rf"(?:(?P{_IDENTIFIER})::)?(?P~?{_IDENTIFIER})\s*\([^;{{}}]*\)\s*" + rf"(?:const\s*)?(?:noexcept(?:\s*\([^)]*\))?\s*)?(?:override\s*)?(?:final\s*)?" + rf"(?:->\s*[^{{;]+)?(?::[^{{;]+)?\s*\{{" +) +_FUNCTION_DECL_RE = re.compile( + rf"^\s*(?:(?:extern\s+\"C\"\s+)?(?P(?:[\w:<>~*&,\.\[\]\s]+)\s+))?" + rf"(?:(?P{_IDENTIFIER})::)?(?P~?{_IDENTIFIER})\s*\([^;{{}}]*\)\s*" + rf"(?:const\s*)?(?:noexcept(?:\s*\([^)]*\))?\s*)?(?:override\s*)?(?:final\s*)?" 
+ rf"(?:->\s*[^{{;]+)?(?::[^{{;]+)?\s*;" +) +_STATIC_CALL_RE = re.compile(rf"(?{_IDENTIFIER})::(?P~?{_IDENTIFIER})\s*\(") +_NEW_EXPRESSION_RE = re.compile(rf"\bnew\s+(?P{_IDENTIFIER})\s*\(") +_DIRECT_CALL_RE = re.compile(rf"(?])(?P{_IDENTIFIER})\s*\(") +_ACCESS_SPECIFIER_RE = re.compile(r"^\s*(?:public|private|protected)\s*:\s*$") + +_C_OPENING_KEYWORDS = frozenset({ + "if", + "else", + "while", + "for", + "do", + "switch", + "try", + "catch", + "namespace", + "extern", +}) +_C_CALL_KEYWORDS = frozenset({ + "if", + "while", + "for", + "switch", + "return", + "sizeof", + "alignof", + "typeof", + "offsetof", + "new", + "delete", + "catch", + "throw", + "decltype", + "static_assert", +}) +_C_BUILTINS = frozenset({ + "printf", + "fprintf", + "sprintf", + "snprintf", + "malloc", + "calloc", + "realloc", + "free", + "memcpy", + "memmove", + "memset", + "strlen", + "strcpy", + "strncpy", + "strcmp", + "strncmp", + "strcat", + "puts", + "putchar", + "fopen", + "fclose", + "fread", + "fwrite", +}) + + +class CFamilyParser(BaseLanguageParser): + language = "c" + + def __init__(self, config: LanguageConfig): + self.config = config + self.backend = TreeSitterBackend(config.tree_sitter_language) + + def parse_file(self, path: str, source: str) -> LPFileResult: + lines = source.splitlines() + units: list[LPCodeUnit] = [] + dependencies: list[LPDependency] = [] + + units.extend(self._extract_includes(path, lines, dependencies)) + class_units, class_ranges = self._extract_class_like_units(path, lines) + units.extend(class_units) + units.extend(self._extract_functions(path, lines, class_ranges)) + dependencies.extend(self._extract_invokes(path, lines, units)) + + syntax_error = self._syntax_error(source) + return LPFileResult( + file_path=path, + language=self.language, + units=units, + dependencies=dependencies, + syntax_error=syntax_error, + ) + + def validate_syntax(self, path: str, source: str) -> tuple[bool, str | None]: + syntax_error = self._syntax_error(source) + return 
(syntax_error is None, syntax_error) + + def _extract_includes( + self, + path: str, + lines: list[str], + dependencies: list[LPDependency], + ) -> list[LPCodeUnit]: + units: list[LPCodeUnit] = [] + for index, line in enumerate(lines): + match = _INCLUDE_QUOTE_RE.match(line) + include_style = "quote" + if match is None: + match = _INCLUDE_ANGLE_RE.match(line) + include_style = "angle" + if match is None: + continue + + include_path = match.group("path") + units.append( + make_unit( + name=include_path, + unit_type="import", + file_path=path, + parent=None, + lines=lines, + line_start=index + 1, + line_end=index + 1, + language=self.language, + node_type="preproc_include", + extra={ + "module": include_path, + "import_path": include_path, + "include_style": include_style, + }, + ) + ) + dep = dependency_from_import( + path=path, + module=include_path, + symbol=include_path, + line=index + 1, + language=self.language, + import_kind=f"{self.language}_include", + ) + dep.extra.update({"include_style": include_style, "import_path": include_path}) + dependencies.append(dep) + return units + + def _extract_class_like_units(self, path: str, lines: list[str]) -> tuple[list[LPCodeUnit], list[tuple[int, int]]]: + units: list[LPCodeUnit] = [] + ranges: list[tuple[int, int]] = [] + index = 0 + while index < len(lines): + match = _CLASS_LIKE_RE.match(self._clean_line(lines[index])) + if match is None: + index += 1 + continue + + name = match.group("name") + kind = match.group("kind") + unit_type = "class" if kind == "class" else "struct" + end = block_end_for_braces(lines, index) + ranges.append((index, end)) + units.append( + make_unit( + name=name, + unit_type=unit_type, + file_path=path, + parent=None, + lines=lines, + line_start=index + 1, + line_end=end + 1, + language=self.language, + node_type=f"{kind}_specifier", + extra={"kind": kind}, + ) + ) + units.extend(self._extract_methods(path, lines, name, index, end)) + index = end + 1 + return units, ranges + + def 
_extract_methods( + self, + path: str, + lines: list[str], + class_name: str, + class_start: int, + class_end: int, + ) -> list[LPCodeUnit]: + units: list[LPCodeUnit] = [] + depth = self._brace_delta(lines[class_start]) + index = class_start + 1 + while index < class_end: + line = lines[index] + clean = self._clean_line(line) + if depth == 1 and not _ACCESS_SPECIFIER_RE.match(clean): + match = _METHOD_RE.match(clean) + if match is not None: + method_end = block_end_for_braces(lines, index) if "{" in clean else line_end_for_statement(lines, index) + units.append( + make_unit( + name=match.group("name"), + unit_type="method", + file_path=path, + parent=class_name, + lines=lines, + line_start=index + 1, + line_end=method_end + 1, + language=self.language, + node_type="method_definition" if "{" in clean else "method_declaration", + ) + ) + depth += self._brace_delta(line) + if depth < 0: + depth = 0 + index += 1 + return units + + def _extract_functions( + self, + path: str, + lines: list[str], + excluded_ranges: list[tuple[int, int]], + ) -> list[LPCodeUnit]: + units: list[LPCodeUnit] = [] + index = 0 + while index < len(lines): + if self._in_ranges(index, excluded_ranges): + index += 1 + continue + + match, signature_end, has_body = self._match_function(lines, index) + if match is None: + index += 1 + continue + + name = match.group("name") + parent = match.group("parent") + if name in _C_OPENING_KEYWORDS or (not parent and not match.group("prefix")): + index += 1 + continue + + end = block_end_for_braces(lines, signature_end) if has_body else line_end_for_statement(lines, signature_end) + unit_type = "method" if parent else "function" + extra = {"qualified_parent": parent} if parent else None + units.append( + make_unit( + name=name, + unit_type=unit_type, + file_path=path, + parent=parent, + lines=lines, + line_start=index + 1, + line_end=end + 1, + language=self.language, + node_type=( + "method_definition" if parent and has_body + else "method_declaration" if 
parent + else "function_definition" if has_body + else "function_declaration" + ), + extra=extra, + ) + ) + index = end + 1 + return units + + def _match_function( + self, + lines: list[str], + start_index: int, + ) -> tuple[re.Match[str] | None, int, bool]: + if not self._clean_line(lines[start_index]).strip(): + return None, start_index, False + + statement_parts: list[str] = [] + max_end = min(len(lines), start_index + 8) + for end_index in range(start_index, max_end): + clean = self._clean_line(lines[end_index]).strip() + if not clean: + continue + statement_parts.append(clean) + statement = " ".join(statement_parts) + open_index = statement.find("{") + semi_index = statement.find(";") + if semi_index != -1 and (open_index == -1 or semi_index < open_index): + match = _FUNCTION_DECL_RE.match(statement) + if match is None: + return None, start_index, False + return match, end_index, False + if open_index == -1: + continue + match = _FUNCTION_DEF_RE.match(statement) + if match is None: + return None, start_index, False + return match, end_index, True + return None, start_index, False + + def _extract_invokes(self, path: str, lines: list[str], units: list[LPCodeUnit]) -> list[LPDependency]: + import_ranges = [ + (unit.line_start, unit.line_end) + for unit in units + if unit.unit_type == "import" and unit.line_start is not None and unit.line_end is not None + ] + unit_start_lines = { + unit.line_start + for unit in units + if unit.unit_type in {"class", "struct", "function", "method"} and unit.line_start is not None + } + dependencies: list[LPDependency] = [] + seen: set[tuple[str, str, int, str, str | None]] = set() + + for line_number, line in enumerate(lines, start=1): + if any(start <= line_number <= end for start, end in import_ranges): + continue + clean = self._clean_line(line) + if line_number in unit_start_lines: + clean = clean.split("{", 1)[1] if "{" in clean else "" + if not clean: + continue + source_ref = self._source_reference_for_line(path, units, 
line_number) + + static_spans: list[tuple[int, int]] = [] + for match in _STATIC_CALL_RE.finditer(clean): + qualifier = match.group("qualifier") + name = match.group("name") + static_spans.append(match.span("name")) + if name in _C_CALL_KEYWORDS: + continue + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "static", + qualifier, + ) + + constructor_spans: list[tuple[int, int]] = [] + for match in _NEW_EXPRESSION_RE.finditer(clean): + name = match.group("name") + constructor_spans.append(match.span("name")) + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "constructor", + None, + ) + + for match in _DIRECT_CALL_RE.finditer(clean): + name = match.group("name") + if name in _C_CALL_KEYWORDS or name in _C_BUILTINS: + continue + if any(start <= match.start("name") < end for start, end in static_spans + constructor_spans): + continue + if self._is_declaration_call_context(clean, match.start("name")): + continue + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "direct", + None, + ) + return dependencies + + def _append_invoke_dependency( + self, + dependencies: list[LPDependency], + seen: set[tuple[str, str, int, str, str | None]], + source_ref: str, + name: str, + line_number: int, + call_kind: str, + qualifier: str | None, + ) -> None: + key = (source_ref, name, line_number, call_kind, qualifier) + if key in seen: + return + seen.add(key) + extra = {"language": self.language, "call_kind": call_kind} + if qualifier: + extra["qualifier"] = qualifier + dependencies.append( + LPDependency( + src=source_ref, + dst=qualifier or name, + relation="invokes", + symbol=name, + line=line_number, + confidence="high", + extra=extra, + ) + ) + + def _source_reference_for_line(self, path: str, units: list[LPCodeUnit], line_number: int) -> str: + candidates = [ + unit + for unit in units + if unit.unit_type in {"function", "method"} + and 
unit.line_start is not None + and unit.line_end is not None + and unit.line_start <= line_number <= unit.line_end + ] + if not candidates: + return path + candidates.sort(key=lambda unit: (unit.line_end or line_number) - (unit.line_start or line_number)) + unit = candidates[0] + if unit.parent and unit.name: + return f"{path}:{unit.parent}.{unit.name}" + if unit.name: + return f"{path}:{unit.name}" + return path + + def _is_declaration_call_context(self, clean_line: str, match_start: int) -> bool: + prefix = clean_line[:match_start].rstrip() + if prefix.endswith(("#define", "typedef")): + return True + if re.search(r"(?:^|\b)(?:class|struct|enum|if|for|while|switch|catch)\s*$", prefix): + return True + return False + + def _syntax_error(self, source: str) -> str | None: + backend_result = self.backend.validate_syntax(source) + if backend_result is not None: + valid, error = backend_result + if not valid: + return error + return delimiter_syntax_error(source) + + def _clean_line(self, line: str) -> str: + return strip_string_literals(line).split("//", 1)[0] + + def _brace_delta(self, line: str) -> int: + clean = self._clean_line(line) + return clean.count("{") - clean.count("}") + + def _in_ranges(self, index: int, ranges: list[tuple[int, int]]) -> bool: + return any(start <= index <= end for start, end in ranges) diff --git a/CoderMind/scripts/lang_parser/_ecmascript_parser.py b/CoderMind/scripts/lang_parser/_ecmascript_parser.py new file mode 100644 index 0000000..c919f73 --- /dev/null +++ b/CoderMind/scripts/lang_parser/_ecmascript_parser.py @@ -0,0 +1,448 @@ +from __future__ import annotations + +import re + +from .base import BaseLanguageParser +from .extractors.fallback import ( + block_end_for_braces, + delimiter_syntax_error, + dependency_from_import, + line_end_for_statement, + make_unit, + strip_string_literals, +) +from .models import LanguageConfig, LPCodeUnit, LPDependency, LPFileResult +from .tree_sitter_backend import TreeSitterBackend + + 
+_IDENTIFIER = r"[A-Za-z_$][\w$]*" +_IMPORT_RE = re.compile(r"^\s*import\b") +_EXPORT_FROM_RE = re.compile(r"^\s*export\b[\s\S]*\bfrom\s+['\"]([^'\"]+)['\"]") +_IMPORT_FROM_RE = re.compile(r"\bfrom\s+['\"]([^'\"]+)['\"]") +_IMPORT_SIDE_EFFECT_RE = re.compile(r"^\s*import\s+['\"]([^'\"]+)['\"]") +_DEFAULT_EXPORT_RE = re.compile(r"^\s*export\s+default\b") +_CLASS_RE = re.compile( + rf"^\s*(?:export\s+default\s+|export\s+)?(?:abstract\s+)?class\s+(?P{_IDENTIFIER})\b" +) +_METHOD_RE = re.compile( + rf"^\s*(?:(?:public|private|protected|static|async|override|readonly|abstract)\s+)*" + rf"(?:get\s+|set\s+)?(?Pconstructor|#?{_IDENTIFIER})\s*" + rf"(?:<[^>{{}}]+>)?\([^)]*\)\s*(?::[^={{;]+)?(?:\{{|;)" +) +_FIELD_METHOD_RE = re.compile( + rf"^\s*(?:(?:public|private|protected|static|async|override|readonly)\s+)*" + rf"(?P#?{_IDENTIFIER})\s*=\s*(?:async\s*)?(?:\([^)]*\)|{_IDENTIFIER})\s*" + rf"(?::[^=]+)?=>\s*\{{" +) +_FUNCTION_RE = re.compile( + rf"^\s*(?:export\s+default\s+|export\s+)?(?:async\s+)?function\s+(?P{_IDENTIFIER})\s*\(" +) +_ARROW_FUNCTION_RE = re.compile( + rf"^\s*(?:export\s+)?(?:const|let|var)\s+(?P{_IDENTIFIER})\s*=\s*" + rf"(?:async\s*)?(?:\([^)]*\)|{_IDENTIFIER})\s*(?::[^=]+)?=>" +) +_COMMONJS_FUNCTION_RE = re.compile( + rf"^\s*(?:module\.exports\.|exports\.)(?P{_IDENTIFIER})\s*=\s*(?:async\s+)?function\b" +) +_NEW_EXPRESSION_RE = re.compile(rf"\bnew\s+(?P{_IDENTIFIER})\s*(?:<[^>]+>)?\(") +_DIRECT_CALL_RE = re.compile(rf"(?{_IDENTIFIER})\s*(?:<[^>]+>)?\(") +_CALL_KEYWORDS = frozenset({ + "if", "for", "while", "switch", "catch", "function", "return", "typeof", + "void", "delete", "await", "new", "super", "import", "require", +}) +_DECLARATION_PREFIX_RE = re.compile( + r"(?:^|\b)(?:function|class|interface|if|for|while|switch|catch|with|const|let|var)\s*$" +) + + +class ECMAScriptParser(BaseLanguageParser): + language = "javascript" + + def __init__(self, config: LanguageConfig): + self.config = config + self.backend = 
def parse_file(self, path: str, source: str) -> LPFileResult:
    """Extract code units and dependency edges from one source file.

    The extraction passes run in a fixed order because later passes consume
    the output of earlier ones:

    1. imports  -- returns import units and appends their dependency edges
       to the shared ``dependencies`` list;
    2. classes  -- returns class/method units plus the line ranges they cover;
    3. functions -- skips the class ranges from step 2;
    4. invocations -- needs the complete unit list to resolve call sites.

    The syntax check is computed last and reported alongside the units; it
    does not prevent extraction.
    """
    lines = source.splitlines()
    dependencies: list[LPDependency] = []

    import_units = self._extract_imports(path, lines, dependencies)
    class_units, class_ranges = self._extract_classes(path, lines)
    function_units = self._extract_functions(path, lines, class_ranges)
    units: list[LPCodeUnit] = [*import_units, *class_units, *function_units]

    dependencies += self._extract_invokes(path, lines, units)

    syntax_error = self._syntax_error(source)
    return LPFileResult(
        file_path=path,
        language=self.language,
        units=units,
        dependencies=dependencies,
        syntax_error=syntax_error,
    )
def _import_bindings_from_statement(self, statement: str, module: str) -> dict[str, dict[str, str]]:
    """Map every local name bound by an ``import`` statement to its origin.

    Args:
        statement: Full text of one import/export statement (may span lines).
        module: Module specifier already extracted from ``statement``.

    Returns:
        ``{local_name: {"module", "imported", "kind"}}`` where ``kind`` is
        ``"namespace"`` (``* as ns``, imported ``"*"``), ``"named"``
        (``{ a, b as c }``) or ``"default"``.  An empty dict is returned for
        anything that is not a value import: re-exports, side-effect imports
        (``import "x"``) and whole-statement ``import type ...``.  Inline
        ``type`` specifiers inside braces are skipped as well.
    """
    stripped = statement.strip()
    if not _IMPORT_RE.match(stripped) or _IMPORT_SIDE_EFFECT_RE.match(stripped):
        return {}
    if re.match(r"^import\s+type\b", stripped):
        return {}

    # Keep only the import clause: drop the leading "import" and everything
    # from the first "from" keyword onwards.
    clause = re.split(r"\bfrom\b", stripped, maxsplit=1)[0]
    clause = re.sub(r"^import\s+", "", clause, count=1).strip()
    namespace_pattern = rf"\*\s+as\s+(?P<local>{_IDENTIFIER})"
    bindings: dict[str, dict[str, str]] = {}

    def bind_namespace(local: str) -> None:
        bindings[local] = {"module": module, "imported": "*", "kind": "namespace"}

    # import * as ns from "m"
    namespace_match = re.fullmatch(namespace_pattern, clause)
    if namespace_match:
        bind_namespace(namespace_match.group("local"))
        return bindings

    # import { a, b as c } from "m"   (optionally preceded by a default name)
    named_match = re.search(r"\{(?P<body>[\s\S]*)\}", clause)
    default_part = clause
    if named_match:
        default_part = clause[:named_match.start()].strip().rstrip(",")
        for item in named_match.group("body").split(","):
            item = item.strip()
            if not item or item.startswith("type "):
                continue
            if " as " in item:
                imported, local = [part.strip() for part in item.split(" as ", 1)]
            else:
                imported = local = item
            if re.fullmatch(_IDENTIFIER, imported) and re.fullmatch(_IDENTIFIER, local):
                bindings[local] = {"module": module, "imported": imported, "kind": "named"}

    # import def from "m"   /   import def, * as ns from "m"
    default_part = default_part.strip()
    if default_part:
        default_name, _, remainder = default_part.partition(",")
        default_name = default_name.strip()
        if re.fullmatch(_IDENTIFIER, default_name):
            bindings[default_name] = {"module": module, "imported": default_name, "kind": "default"}
        # A default import may be combined with a namespace import; record
        # the namespace too instead of silently dropping it.
        combined_namespace = re.fullmatch(namespace_pattern, remainder.strip())
        if combined_namespace:
            bind_namespace(combined_namespace.group("local"))

    return bindings
index += 1 + return units + + def _extract_functions( + self, + path: str, + lines: list[str], + excluded_ranges: list[tuple[int, int]], + ) -> list[LPCodeUnit]: + units: list[LPCodeUnit] = [] + depth = 0 + index = 0 + while index < len(lines): + if any(start <= index <= end for start, end in excluded_ranges): + index += 1 + continue + + line = lines[index] + clean = strip_string_literals(line) + if depth == 0: + match = _FUNCTION_RE.match(line) or _ARROW_FUNCTION_RE.match(line) or _COMMONJS_FUNCTION_RE.match(line) + if match: + end = block_end_for_braces(lines, index) if "{" in clean else line_end_for_statement(lines, index) + extra = {"export_default": True} if _DEFAULT_EXPORT_RE.match(line) else None + units.append( + make_unit( + name=match.group("name"), + unit_type="function", + file_path=path, + parent=None, + lines=lines, + line_start=index + 1, + line_end=end + 1, + language=self.language, + node_type="function_declaration", + extra=extra, + ) + ) + index = end + 1 + continue + depth += clean.count("{") - clean.count("}") + if depth < 0: + depth = 0 + index += 1 + return units + + def _extract_invokes( + self, + path: str, + lines: list[str], + units: list[LPCodeUnit], + ) -> list[LPDependency]: + local_symbols = { + unit.name + for unit in units + if unit.unit_type in {"class", "function"} and unit.name + } + import_bindings: dict[str, dict[str, str]] = {} + import_ranges: list[tuple[int, int]] = [] + for unit in units: + if unit.unit_type != "import": + continue + if unit.line_start is not None and unit.line_end is not None: + import_ranges.append((unit.line_start, unit.line_end)) + for local, binding in unit.extra.get("bindings", {}).items(): + import_bindings[local] = dict(binding) + + dependencies: list[LPDependency] = [] + seen: set[tuple[str, str, int, str]] = set() + for line_number, line in enumerate(lines, start=1): + if any(start <= line_number <= end for start, end in import_ranges): + continue + clean = strip_string_literals(line).split("//", 
1)[0] + source_ref = self._source_reference_for_line(path, units, line_number) + + constructor_spans: list[tuple[int, int]] = [] + for match in _NEW_EXPRESSION_RE.finditer(clean): + name = match.group("name") + constructor_spans.append(match.span("name")) + if name in local_symbols or name in import_bindings: + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "constructor", + import_bindings.get(name), + ) + + for match in _DIRECT_CALL_RE.finditer(clean): + name = match.group("name") + if name in _CALL_KEYWORDS: + continue + if any(start <= match.start("name") < end for start, end in constructor_spans): + continue + if self._is_declaration_call_context(clean, match.start("name")): + continue + if name in local_symbols or name in import_bindings: + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "function", + import_bindings.get(name), + ) + return dependencies + + def _append_invoke_dependency( + self, + dependencies: list[LPDependency], + seen: set[tuple[str, str, int, str]], + source_ref: str, + name: str, + line_number: int, + call_kind: str, + import_binding: dict[str, str] | None, + ) -> None: + key = (source_ref, name, line_number, call_kind) + if key in seen: + return + seen.add(key) + extra = {"language": self.language, "call_kind": call_kind, "local": name} + dst = name + if import_binding: + extra.update(import_binding) + dst = import_binding["module"] + dependencies.append( + LPDependency( + src=source_ref, + dst=dst, + relation="invokes", + symbol=name, + line=line_number, + confidence="high", + extra=extra, + ) + ) + + def _source_reference_for_line(self, path: str, units: list[LPCodeUnit], line_number: int) -> str: + candidates = [ + unit + for unit in units + if unit.unit_type in {"function", "method", "class"} + and unit.line_start is not None + and unit.line_end is not None + and unit.line_start <= line_number <= unit.line_end + ] + if not candidates: + 
class BaseLanguageParser(ABC):
    """Interface shared by every per-language parser.

    Concrete parsers must implement both methods; the class cannot be
    instantiated directly.
    """

    @abstractmethod
    def parse_file(self, path: str, source: str) -> LPFileResult:
        """Parse ``source`` (the text of the file at ``path``) into a file result."""
        raise NotImplementedError

    @abstractmethod
    def validate_syntax(self, path: str, source: str) -> tuple[bool, str | None]:
        """Check ``source`` for syntax errors.

        Returns:
            ``(is_valid, error)``.  The ECMAScript implementation in this
            package returns ``error=None`` exactly when ``is_valid`` is true;
            other implementations presumably follow the same convention.
        """
        raise NotImplementedError
._c_family_parser import CFamilyParser +from .config.c import C_CONFIG + + +class CParser(CFamilyParser): + language = "c" + + def __init__(self) -> None: + super().__init__(C_CONFIG) diff --git a/CoderMind/scripts/lang_parser/config/__init__.py b/CoderMind/scripts/lang_parser/config/__init__.py new file mode 100644 index 0000000..a4a77ef --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/__init__.py @@ -0,0 +1,17 @@ +from .c import C_CONFIG +from .cpp import CPP_CONFIG +from .go import GO_CONFIG +from .javascript import JAVASCRIPT_CONFIG +from .python import PYTHON_CONFIG +from .rust import RUST_CONFIG +from .typescript import TYPESCRIPT_CONFIG + +__all__ = [ + "C_CONFIG", + "CPP_CONFIG", + "GO_CONFIG", + "JAVASCRIPT_CONFIG", + "PYTHON_CONFIG", + "RUST_CONFIG", + "TYPESCRIPT_CONFIG", +] diff --git a/CoderMind/scripts/lang_parser/config/c.py b/CoderMind/scripts/lang_parser/config/c.py new file mode 100644 index 0000000..eecc2fe --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/c.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from ..models import LanguageConfig + + +_C_TEST_GLOBS = ( + "*_test.c", + "**/*_test.c", + "test_*.c", + "**/test_*.c", + "tests/*.c", + "tests/**/*.c", + "test/*.c", + "test/**/*.c", + "**/tests/*.c", + "**/tests/**/*.c", + "**/test/*.c", + "**/test/**/*.c", +) + + +C_CONFIG = LanguageConfig( + name="c", + display_name="C", + extensions=(".c", ".h"), + markdown_fence="c", + source_globs=("*.c", "*.h", "**/*.c", "**/*.h"), + test_globs=_C_TEST_GLOBS, + tree_sitter_language="c", + class_node_types=("struct_specifier",), + function_node_types=("function_definition",), + method_node_types=(), + import_node_types=("preproc_include",), + module_path_style="c", + dependency_files=("CMakeLists.txt", "Makefile", "compile_commands.json"), + entrypoint_candidates=("main.c", "src/main.c"), +) diff --git a/CoderMind/scripts/lang_parser/config/cpp.py b/CoderMind/scripts/lang_parser/config/cpp.py new file mode 100644 index 
0000000..9d26ed4 --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/cpp.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from ..models import LanguageConfig + + +_CPP_TEST_GLOBS = ( + "*_test.cpp", + "**/*_test.cpp", + "test_*.cpp", + "**/test_*.cpp", + "*_test.cc", + "**/*_test.cc", + "test_*.cc", + "**/test_*.cc", + "*_test.cxx", + "**/*_test.cxx", + "test_*.cxx", + "**/test_*.cxx", + "tests/*.cpp", + "tests/**/*.cpp", + "tests/*.cc", + "tests/**/*.cc", + "tests/*.cxx", + "tests/**/*.cxx", + "test/*.cpp", + "test/**/*.cpp", + "test/*.cc", + "test/**/*.cc", + "test/*.cxx", + "test/**/*.cxx", + "**/tests/*.cpp", + "**/tests/**/*.cpp", + "**/tests/*.cc", + "**/tests/**/*.cc", + "**/tests/*.cxx", + "**/tests/**/*.cxx", + "**/test/*.cpp", + "**/test/**/*.cpp", + "**/test/*.cc", + "**/test/**/*.cc", + "**/test/*.cxx", + "**/test/**/*.cxx", +) + + +CPP_CONFIG = LanguageConfig( + name="cpp", + display_name="C++", + extensions=(".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx"), + markdown_fence="cpp", + source_globs=( + "*.cpp", + "**/*.cpp", + "*.cc", + "**/*.cc", + "*.cxx", + "**/*.cxx", + "*.hpp", + "**/*.hpp", + "*.hh", + "**/*.hh", + "*.hxx", + "**/*.hxx", + ), + test_globs=_CPP_TEST_GLOBS, + tree_sitter_language="cpp", + class_node_types=("class_specifier", "struct_specifier"), + function_node_types=("function_definition",), + method_node_types=("function_definition",), + import_node_types=("preproc_include",), + module_path_style="c", + dependency_files=("CMakeLists.txt", "Makefile", "compile_commands.json", "CMakePresets.json"), + entrypoint_candidates=("main.cpp", "src/main.cpp", "main.cc", "src/main.cc"), +) diff --git a/CoderMind/scripts/lang_parser/config/go.py b/CoderMind/scripts/lang_parser/config/go.py new file mode 100644 index 0000000..c005d94 --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/go.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from ..models import LanguageConfig + + +GO_CONFIG = LanguageConfig( + name="go", + 
display_name="Go", + extensions=(".go",), + markdown_fence="go", + source_globs=("*.go", "**/*.go"), + test_globs=( + "*_test.go", + "**/*_test.go", + "tests/*.go", + "tests/**/*.go", + "test/*.go", + "test/**/*.go", + "**/tests/*.go", + "**/tests/**/*.go", + "**/test/*.go", + "**/test/**/*.go", + ), + tree_sitter_language="go", + class_node_types=("type_declaration", "struct_type", "interface_type"), + function_node_types=("function_declaration",), + method_node_types=("method_declaration",), + import_node_types=("import_declaration", "import_spec"), + module_path_style="go", + default_test_command=("go", "test", "./..."), + dependency_files=("go.mod", "go.sum"), + entrypoint_candidates=("main.go", "cmd/main.go"), +) diff --git a/CoderMind/scripts/lang_parser/config/javascript.py b/CoderMind/scripts/lang_parser/config/javascript.py new file mode 100644 index 0000000..ee126d6 --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/javascript.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from ..models import LanguageConfig + + +_JAVASCRIPT_TEST_GLOBS = ( + "*.test.js", + "*.spec.js", + "*.test.jsx", + "*.spec.jsx", + "**/*.test.js", + "**/*.spec.js", + "**/*.test.jsx", + "**/*.spec.jsx", + "tests/*.js", + "tests/**/*.js", + "tests/*.jsx", + "tests/**/*.jsx", + "test/*.js", + "test/**/*.js", + "test/*.jsx", + "test/**/*.jsx", + "__tests__/*.js", + "__tests__/**/*.js", + "__tests__/*.jsx", + "__tests__/**/*.jsx", + "**/tests/*.js", + "**/tests/**/*.js", + "**/tests/*.jsx", + "**/tests/**/*.jsx", + "**/test/*.js", + "**/test/**/*.js", + "**/test/*.jsx", + "**/test/**/*.jsx", + "**/__tests__/*.js", + "**/__tests__/**/*.js", + "**/__tests__/*.jsx", + "**/__tests__/**/*.jsx", +) + + +JAVASCRIPT_CONFIG = LanguageConfig( + name="javascript", + display_name="JavaScript", + extensions=(".js", ".jsx"), + markdown_fence="javascript", + source_globs=("*.js", "*.jsx", "**/*.js", "**/*.jsx"), + test_globs=_JAVASCRIPT_TEST_GLOBS, + tree_sitter_language="javascript", + 
# Glob patterns identifying Python test sources.  Kept in a private constant
# like the sibling C / C++ / JavaScript / TypeScript config modules do.
_PYTHON_TEST_GLOBS = (
    # Files named like tests, at the root.
    "test_*.py",
    "*_test.py",
    # Anything inside a root-level tests/, test/ or testing/ directory.
    "tests/*.py",
    "tests/**/*.py",
    "test/*.py",
    "test/**/*.py",
    "testing/*.py",
    "testing/**/*.py",
    # The "**/"-prefixed twins of all of the above.
    "**/test_*.py",
    "**/*_test.py",
    "**/tests/*.py",
    "**/tests/**/*.py",
    "**/test/*.py",
    "**/test/**/*.py",
    "**/testing/*.py",
    "**/testing/**/*.py",
)


PYTHON_CONFIG = LanguageConfig(
    name="python",
    display_name="Python",
    extensions=(".py",),
    markdown_fence="python",
    source_globs=("*.py", "**/*.py"),
    test_globs=_PYTHON_TEST_GLOBS,
    # No tree-sitter grammar is configured for Python.  The node type names
    # below are the class names used by the standard-library ``ast`` module
    # (presumably the parser used for Python -- confirm in the parser code).
    tree_sitter_language=None,
    class_node_types=("ClassDef",),
    function_node_types=("FunctionDef", "AsyncFunctionDef"),
    method_node_types=("FunctionDef", "AsyncFunctionDef"),
    import_node_types=("Import", "ImportFrom"),
    module_path_style="python",
    default_test_command=("uv", "run", "pytest"),
    dependency_files=("requirements.txt", "pyproject.toml", "setup.py", "setup.cfg"),
    entrypoint_candidates=("main.py", "app.py", "__main__.py"),
)
import annotations + +from ..models import LanguageConfig + + +RUST_CONFIG = LanguageConfig( + name="rust", + display_name="Rust", + extensions=(".rs",), + markdown_fence="rust", + source_globs=("*.rs", "**/*.rs"), + test_globs=( + "tests/*.rs", + "tests/**/*.rs", + "test/*.rs", + "test/**/*.rs", + "**/tests/*.rs", + "**/tests/**/*.rs", + "**/test/*.rs", + "**/test/**/*.rs", + "benches/*.rs", + "benches/**/*.rs", + "examples/*.rs", + "examples/**/*.rs", + ), + tree_sitter_language="rust", + class_node_types=("struct_item", "enum_item"), + function_node_types=("function_item",), + method_node_types=("function_item",), + import_node_types=("use_declaration", "mod_item"), + module_path_style="rust", + default_test_command=("cargo", "test"), + dependency_files=("Cargo.toml", "Cargo.lock"), + entrypoint_candidates=("src/main.rs", "src/lib.rs"), +) diff --git a/CoderMind/scripts/lang_parser/config/typescript.py b/CoderMind/scripts/lang_parser/config/typescript.py new file mode 100644 index 0000000..b77e46c --- /dev/null +++ b/CoderMind/scripts/lang_parser/config/typescript.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from ..models import LanguageConfig + + +_TYPESCRIPT_TEST_GLOBS = ( + "*.test.ts", + "*.spec.ts", + "*.test.tsx", + "*.spec.tsx", + "**/*.test.ts", + "**/*.spec.ts", + "**/*.test.tsx", + "**/*.spec.tsx", + "tests/*.ts", + "tests/**/*.ts", + "tests/*.tsx", + "tests/**/*.tsx", + "test/*.ts", + "test/**/*.ts", + "test/*.tsx", + "test/**/*.tsx", + "__tests__/*.ts", + "__tests__/**/*.ts", + "__tests__/*.tsx", + "__tests__/**/*.tsx", + "**/tests/*.ts", + "**/tests/**/*.ts", + "**/tests/*.tsx", + "**/tests/**/*.tsx", + "**/test/*.ts", + "**/test/**/*.ts", + "**/test/*.tsx", + "**/test/**/*.tsx", + "**/__tests__/*.ts", + "**/__tests__/**/*.ts", + "**/__tests__/*.tsx", + "**/__tests__/**/*.tsx", +) + + +TYPESCRIPT_CONFIG = LanguageConfig( + name="typescript", + display_name="TypeScript", + extensions=(".ts", ".tsx"), + markdown_fence="typescript", + 
class CppParser(CFamilyParser):
    """C-family parser bound to the C++ language configuration."""

    # Language tag reported on the units and dependencies this parser emits.
    language = "cpp"

    def __init__(self) -> None:
        """Initialise the shared C-family implementation with ``CPP_CONFIG``."""
        super().__init__(CPP_CONFIG)
def block_end_for_braces(lines: list[str], start_index: int) -> int:
    """Return the index of the line that closes the brace block at ``start_index``.

    Lines are scanned from ``start_index`` onwards while a running ``{`` /
    ``}`` balance is kept.  The block ends on the first line where at least
    one ``{`` has been seen and the balance is back to zero (or below).

    Braces inside string literals and inside ``//`` line comments are
    ignored, so a comment such as ``// closes the loop }`` no longer ends the
    block early.  Block comments (``/* ... */``) are *not* recognised.

    Args:
        lines: All physical lines of the file.
        start_index: 0-based index of the line where the block starts.

    Returns:
        The 0-based index of the closing line, or ``start_index`` when no
        balanced block is found (including when the range is empty).
    """
    depth = 0
    saw_open = False
    for index in range(start_index, len(lines)):
        # Strip literals first so a "//" inside a string (e.g. a URL) is not
        # mistaken for the start of a comment.
        code = strip_string_literals(lines[index]).split("//", 1)[0]
        opened = code.count("{")
        if opened:
            saw_open = True
        depth += opened - code.count("}")
        if saw_open and depth <= 0:
            return index
    return start_index
+ if in_block_comment: + if char == "*" and next_char == "/": + in_block_comment = False + index += 2 + continue + index += 1 + continue + if in_string: + if escaped: + escaped = False + elif char == "\\": + escaped = True + elif char == in_string: + in_string = None + index += 1 + continue + if char == "/" and next_char == "*": + in_block_comment = True + index += 2 + continue + if char == "/" and next_char == "/": + break + if char == "'": + # A single quote is ambiguous: it may open a C/C++/Go char + # literal ('x', '\n', '\''), a Rust lifetime ('a, 'static), or + # appear in prose the scanner already routed here. Only consume + # a balanced char literal; otherwise treat the quote as an inert + # character so lifetimes/apostrophes do not trigger a spurious + # "unterminated string literal". + consumed = _char_literal_length(line, index) + if consumed: + index += consumed + continue + index += 1 + continue + if char in {'"', "`"}: + in_string = char + elif char in pairs: + stack.append((char, line_number)) + elif char in closing: + if not stack or stack[-1][0] != closing[char]: + return f"Unmatched delimiter {char!r} at line {line_number}" + stack.pop() + index += 1 + # Double-quoted strings do not span physical lines unless the line ends + # with a backslash continuation; reset a dangling quote so a genuinely + # malformed line does not cascade into later lines. Backtick template + # literals (JS/TS) legitimately span lines, so leave them open. + if in_string == '"' and not line.endswith("\\"): + in_string = None + escaped = False + + if in_block_comment: + return "Unterminated block comment" + if in_string: + return "Unterminated string literal" + if stack: + char, line_number = stack[-1] + return f"Unclosed delimiter {char!r} opened at line {line_number}" + return None + + +def _char_literal_length(line: str, start: int) -> int: + """Return the length of a balanced char literal at ``line[start]`` or 0. + + ``start`` must point at the opening ``'``. 
Recognizes ``'x'`` and escaped + forms such as ``'\\n'`` / ``'\\''``. Returns 0 when the quote is not a + closed single-character literal (e.g. a Rust lifetime or a stray + apostrophe), signalling the caller to treat it as an inert character. + """ + if start >= len(line) or line[start] != "'": + return 0 + index = start + 1 + if index >= len(line): + return 0 + if line[index] == "\\": + index += 2 # escape sequence consumes backslash + escaped char + else: + index += 1 + if index < len(line) and line[index] == "'": + return index - start + 1 + return 0 + + +def make_unit( + *, + name: str | None, + unit_type: str, + file_path: str, + parent: str | None, + lines: list[str], + line_start: int, + line_end: int, + language: str, + node_type: str, + extra: dict | None = None, +) -> LPCodeUnit: + metadata = { + "language": language, + "line_start": line_start, + "line_end": line_end, + "node_type": node_type, + } + if extra: + metadata.update(extra) + return LPCodeUnit( + name=name, + unit_type=unit_type, + file_path=file_path, + parent=parent, + line_start=line_start, + line_end=line_end, + code=source_slice(lines, line_start, line_end), + language=language, + extra=metadata, + ) + + +def dependency_from_import( + *, + path: str, + module: str | None, + symbol: str | None, + line: int | None, + language: str, + import_kind: str, +) -> LPDependency: + return LPDependency( + src=path, + dst=module, + relation="imports", + symbol=symbol, + line=line, + confidence="unresolved", + extra={"language": language, "import_kind": import_kind}, + ) + + +def class_like_label(unit_type: str) -> str: + if unit_type == "struct": + return "struct" + if unit_type == "interface": + return "interface" + return "class" + + +def top_level_line_indices(lines: Iterable[str], excluded_ranges: list[tuple[int, int]]) -> Iterable[tuple[int, str]]: + for index, line in enumerate(lines): + if any(start <= index <= end for start, end in excluded_ranges): + continue + yield index, line + + +def 
class GoParser(BaseLanguageParser):
    """Line-oriented Go parser built on regular expressions and brace matching.

    ``parse_file`` walks the source one line at a time and emits units for the
    package clause, imports, structs, interfaces, methods and functions, plus
    ``imports`` and ``invokes`` dependencies. Syntax validation asks the
    tree-sitter backend first and then falls back to the delimiter scanner and
    a package-clause check.
    """

    language = "go"

    # Go reserved words that are not already listed in ``_GO_CALL_KEYWORDS``.
    # Several of them are routinely followed by "(" without being a call
    # (``import (``, ``var (``, ``const (``, ``type (``, ``case (x):``), so
    # they must never be reported as direct invocations.
    _NON_CALL_RESERVED_WORDS = frozenset({
        "break", "case", "chan", "const", "continue", "default", "else",
        "fallthrough", "goto", "import", "interface", "map", "package",
        "struct", "type", "var",
    })

    # Opening of a grouped import. gofmt writes ``import (`` but ``import(``
    # is equally valid Go.
    _IMPORT_GROUP_RE = re.compile(r"import\s*\(")

    def __init__(self) -> None:
        self.backend = TreeSitterBackend(GO_CONFIG.tree_sitter_language)

    def parse_file(self, path: str, source: str) -> LPFileResult:
        """Extract code units and dependencies from one Go source file.

        Args:
            path: File path recorded on every unit and dependency.
            source: Full text of the file.

        Returns:
            An ``LPFileResult`` holding the units, the import/invoke
            dependencies and the result of the syntax check.
        """
        lines = source.splitlines()
        units: list[LPCodeUnit] = []
        dependencies: list[LPDependency] = []
        import_aliases: dict[str, str] = {}

        index = 0
        while index < len(lines):
            line = lines[index]

            package_match = _PACKAGE_RE.match(line)
            if package_match:
                units.append(
                    make_unit(
                        name=package_match.group(1),
                        unit_type="package",
                        file_path=path,
                        parent=None,
                        lines=lines,
                        line_start=index + 1,
                        line_end=index + 1,
                        language=self.language,
                        node_type="package_clause",
                    )
                )
                index += 1
                continue

            stripped = line.strip()
            group_open = self._IMPORT_GROUP_RE.match(stripped)
            if group_open:
                index = self._consume_import_group(
                    path,
                    lines,
                    index,
                    stripped[group_open.end():],
                    units,
                    dependencies,
                    import_aliases,
                )
                continue

            import_match = _IMPORT_SINGLE_RE.match(line)
            if import_match:
                self._record_import(
                    import_match, path, lines, index, "import_declaration",
                    units, dependencies, import_aliases,
                )
                index += 1
                continue

            struct_match = _STRUCT_RE.match(line)
            if struct_match:
                index = self._append_block_unit(
                    units, path, lines, index,
                    name=struct_match.group(1),
                    unit_type="struct",
                    node_type="struct_type",
                    extra={"kind": "struct"},
                )
                continue

            interface_match = _INTERFACE_RE.match(line)
            if interface_match:
                index = self._append_block_unit(
                    units, path, lines, index,
                    name=interface_match.group(1),
                    unit_type="interface",
                    node_type="interface_type",
                    extra={"kind": "interface"},
                )
                continue

            method_match = _METHOD_RE.match(line)
            if method_match:
                receiver_type = method_match.group("receiver").replace("*", "").strip()
                index = self._append_block_unit(
                    units, path, lines, index,
                    name=method_match.group("name"),
                    unit_type="method",
                    node_type="method_declaration",
                    parent=receiver_type,
                    extra={"receiver_type": receiver_type},
                )
                continue

            function_match = _FUNCTION_RE.match(line)
            if function_match:
                index = self._append_block_unit(
                    units, path, lines, index,
                    name=function_match.group("name"),
                    unit_type="function",
                    node_type="function_declaration",
                )
                continue

            index += 1

        dependencies.extend(self._extract_invokes(path, lines, units, import_aliases))

        syntax_error = self._syntax_error(source)
        return LPFileResult(
            file_path=path,
            language=self.language,
            units=units,
            dependencies=dependencies,
            syntax_error=syntax_error,
        )

    def validate_syntax(self, path: str, source: str) -> tuple[bool, str | None]:
        """Return ``(True, None)`` for valid source, else ``(False, message)``."""
        syntax_error = self._syntax_error(source)
        return (syntax_error is None, syntax_error)

    def _import_qualifier(self, import_path: str, alias: str | None) -> str | None:
        """Return the name call sites use to reference the imported package.

        Dot and blank imports (``.`` / ``_``) introduce no qualifier. An
        explicit alias wins; otherwise the last path segment is used.
        """
        if alias in {".", "_"}:
            return None
        if alias:
            return alias
        return import_path.rsplit("/", 1)[-1]

    def _record_import(
        self,
        match: re.Match[str],
        path: str,
        lines: list[str],
        index: int,
        node_type: str,
        units: list[LPCodeUnit],
        dependencies: list[LPDependency],
        import_aliases: dict[str, str],
    ) -> None:
        """Append the unit and dependency for one matched import spec.

        ``match`` must expose the ``path`` and ``alias`` groups; ``index`` is
        the zero-based line the spec was found on.
        """
        import_path = match.group("path")
        alias = match.group("alias")
        qualifier = self._import_qualifier(import_path, alias)
        if qualifier:
            import_aliases[qualifier] = import_path
        units.append(
            make_unit(
                name=import_path,
                unit_type="import",
                file_path=path,
                parent=None,
                lines=lines,
                line_start=index + 1,
                line_end=index + 1,
                language=self.language,
                node_type=node_type,
                extra={"alias": alias, "import_path": import_path, "qualifier": qualifier},
            )
        )
        dep = dependency_from_import(
            path=path,
            module=import_path,
            symbol=alias or import_path,
            line=index + 1,
            language=self.language,
            import_kind="go_import",
        )
        dep.extra.update({"alias": alias, "qualifier": qualifier})
        dependencies.append(dep)

    def _consume_import_group(
        self,
        path: str,
        lines: list[str],
        start: int,
        header_rest: str,
        units: list[LPCodeUnit],
        dependencies: list[LPDependency],
        import_aliases: dict[str, str],
    ) -> int:
        """Record every spec of a grouped import and return the next line index.

        ``header_rest`` is the text following the opening parenthesis on the
        header line. When the group is closed on that same line
        (``import ( "fmt" )``) its specs are read from ``header_rest``;
        otherwise following lines are consumed up to the one starting with
        ``)``.
        """
        inline = header_rest.split("//", 1)[0]
        if ")" in inline:
            # Single-line group: scanning forward for a later ")" would
            # swallow unrelated code, so parse the specs in place.
            for spec in inline.split(")", 1)[0].split(";"):
                spec_match = _IMPORT_SPEC_RE.match(spec)
                if spec_match:
                    self._record_import(
                        spec_match, path, lines, start, "import_spec",
                        units, dependencies, import_aliases,
                    )
            return start + 1

        index = start + 1
        while index < len(lines) and not lines[index].strip().startswith(")"):
            spec_match = _IMPORT_SPEC_RE.match(lines[index])
            if spec_match:
                self._record_import(
                    spec_match, path, lines, index, "import_spec",
                    units, dependencies, import_aliases,
                )
            index += 1
        # Step over the closing ")" line (or past the end of the file).
        return index + 1

    def _append_block_unit(
        self,
        units: list[LPCodeUnit],
        path: str,
        lines: list[str],
        index: int,
        *,
        name: str,
        unit_type: str,
        node_type: str,
        parent: str | None = None,
        extra: dict | None = None,
    ) -> int:
        """Append a brace-delimited unit starting at ``index``.

        Returns the index of the first line after the unit's block.
        """
        end = block_end_for_braces(lines, index)
        units.append(
            make_unit(
                name=name,
                unit_type=unit_type,
                file_path=path,
                parent=parent,
                lines=lines,
                line_start=index + 1,
                line_end=end + 1,
                language=self.language,
                node_type=node_type,
                extra=extra,
            )
        )
        return end + 1

    def _extract_invokes(
        self,
        path: str,
        lines: list[str],
        units: list[LPCodeUnit],
        import_aliases: dict[str, str],
    ) -> list[LPDependency]:
        """Collect ``invokes`` dependencies from selector and direct calls.

        Import lines and lines starting with ``func `` are skipped. Selector
        calls are only reported when the qualifier is a known import
        qualifier. Direct calls exclude Go keywords, reserved words, builtins
        and names directly preceded by ``func`` / ``go`` / ``defer``.
        """
        # Precompute the import line numbers once instead of rescanning every
        # range for every source line.
        import_lines = {
            number
            for unit in units
            if unit.unit_type == "import"
            and unit.line_start is not None
            and unit.line_end is not None
            for number in range(unit.line_start, unit.line_end + 1)
        }
        dependencies: list[LPDependency] = []
        seen: set[tuple[str, str, int, str, str | None]] = set()

        for line_number, line in enumerate(lines, start=1):
            if line_number in import_lines:
                continue
            clean = strip_string_literals(line).split("//", 1)[0]
            if clean.lstrip().startswith("func "):
                continue
            source_ref = self._source_reference_for_line(path, units, line_number)

            for match in _SELECTOR_CALL_RE.finditer(clean):
                qualifier = match.group("qualifier")
                name = match.group("name")
                import_path = import_aliases.get(qualifier)
                if not import_path:
                    continue
                key = (source_ref, name, line_number, "selector", qualifier)
                if key in seen:
                    continue
                seen.add(key)
                dependencies.append(
                    LPDependency(
                        src=source_ref,
                        dst=import_path,
                        relation="invokes",
                        symbol=name,
                        line=line_number,
                        confidence="high",
                        extra={
                            "language": self.language,
                            "call_kind": "selector",
                            "qualifier": qualifier,
                            "module": import_path,
                        },
                    )
                )

            for match in _DIRECT_CALL_RE.finditer(clean):
                name = match.group("name")
                if (
                    name in _GO_CALL_KEYWORDS
                    or name in _GO_BUILTINS
                    or name in self._NON_CALL_RESERVED_WORDS
                ):
                    continue
                if self._is_direct_call_declaration(clean, match.start("name")):
                    continue
                key = (source_ref, name, line_number, "direct", None)
                if key in seen:
                    continue
                seen.add(key)
                dependencies.append(
                    LPDependency(
                        src=source_ref,
                        dst=name,
                        relation="invokes",
                        symbol=name,
                        line=line_number,
                        confidence="high",
                        extra={"language": self.language, "call_kind": "direct"},
                    )
                )
        return dependencies

    def _source_reference_for_line(self, path: str, units: list[LPCodeUnit], line_number: int) -> str:
        """Return ``path[:Parent.name]`` for the innermost function/method.

        Falls back to ``path`` when no function or method unit spans
        ``line_number``.
        """
        candidates = [
            unit
            for unit in units
            if unit.unit_type in {"function", "method"}
            and unit.line_start is not None
            and unit.line_end is not None
            and unit.line_start <= line_number <= unit.line_end
        ]
        if not candidates:
            return path
        # The shortest enclosing span is the innermost unit.
        candidates.sort(key=lambda unit: (unit.line_end or line_number) - (unit.line_start or line_number))
        unit = candidates[0]
        if unit.parent and unit.name:
            return f"{path}:{unit.parent}.{unit.name}"
        if unit.name:
            return f"{path}:{unit.name}"
        return path

    def _is_direct_call_declaration(self, clean_line: str, match_start: int) -> bool:
        """Tell whether the text before a matched name ends in func/go/defer."""
        prefix = clean_line[:match_start].rstrip()
        return prefix.endswith(("func", "go", "defer"))

    def _syntax_error(self, source: str) -> str | None:
        """Return a syntax error message for ``source`` or ``None`` if clean.

        The backend verdict is used when it reports invalid source; the
        delimiter scan and the package-clause check always run otherwise.
        """
        backend_result = self.backend.validate_syntax(source)
        if backend_result is not None:
            valid, error = backend_result
            if not valid:
                return error
        delimiter_error = delimiter_syntax_error(source)
        if delimiter_error:
            return delimiter_error
        if not re.search(r"(?m)^\s*package\s+[A-Za-z_]\w*\b", source.strip()):
            return "Go source is missing a package clause"
        return None
class PythonParser(BaseLanguageParser):
    """Python parser backed by the standard-library ``ast`` module.

    Only top-level statements and the direct children of top-level classes
    are turned into units; nested functions and classes are not visited.
    """

    language = "python"

    def parse_file(self, path: str, source: str) -> LPFileResult:
        """Parse ``source`` and return only the ``LPFileResult``."""
        result, _, _ = self.parse_file_with_ast(path, source)
        return result

    def parse_file_with_ast(
        self,
        path: str,
        source: str,
    ) -> tuple[LPFileResult, ast.Module, SyntaxError | None]:
        """Parse ``source`` and also return the module AST and any error.

        Returns:
            ``(result, tree, error)``. On failure ``result`` has no units,
            ``tree`` is an empty module and ``error`` is the ``SyntaxError``.
            ``ast.parse`` raises ``ValueError`` rather than ``SyntaxError``
            for source containing null bytes on some Python versions; that
            case is reported the same way, wrapped in a ``SyntaxError``.
        """
        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError) as exc:
            error = exc if isinstance(exc, SyntaxError) else SyntaxError(str(exc))
            empty_tree = ast.Module(body=[], type_ignores=[])
            return (
                LPFileResult(
                    file_path=path,
                    language=self.language,
                    units=[],
                    dependencies=[],
                    syntax_error=str(exc),
                ),
                empty_tree,
                error,
            )

        return self._result_from_tree(path, source, tree), tree, None

    def validate_syntax(self, path: str, source: str) -> tuple[bool, str | None]:
        """Return ``(True, None)`` if ``source`` parses, else ``(False, message)``."""
        try:
            ast.parse(source)
        except (SyntaxError, ValueError) as exc:
            # ValueError covers null bytes in the source, which would
            # otherwise escape as an unhandled exception.
            return False, str(exc)
        return True, None

    def _result_from_tree(self, path: str, source: str, tree: ast.Module) -> LPFileResult:
        """Build units and import dependencies from a parsed module."""
        units: list[LPCodeUnit] = []
        dependencies: list[LPDependency] = []

        for node in tree.body:
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                units.append(self._make_unit(ast.unparse(node).strip(), "import", path, None, source, node))
                dependencies.extend(self._dependencies_from_import(path, node))
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                units.append(self._make_unit(node.name, "function", path, None, source, node))
            elif isinstance(node, ast.ClassDef):
                units.append(self._make_unit(node.name, "class", path, None, source, node))
                for child in node.body:
                    if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        units.append(self._make_unit(child.name, "method", path, node.name, source, child))
                    elif isinstance(child, (ast.Assign, ast.AnnAssign)):
                        units.append(
                            self._make_unit(
                                self._extract_assignment_name(child),
                                "assignment",
                                path,
                                node.name,
                                source,
                                child,
                            )
                        )
            elif isinstance(node, (ast.Assign, ast.AnnAssign)):
                units.append(
                    self._make_unit(
                        self._extract_assignment_name(node),
                        "assignment",
                        path,
                        None,
                        source,
                        node,
                    )
                )

        return LPFileResult(
            file_path=path,
            language=self.language,
            units=units,
            dependencies=dependencies,
            syntax_error=None,
        )

    def _make_unit(
        self,
        name: str | None,
        unit_type: str,
        path: str,
        parent: str | None,
        source: str,
        node: ast.AST,
    ) -> LPCodeUnit:
        """Wrap an AST node in an ``LPCodeUnit``.

        The node itself and its class name are stored in ``extra`` under
        ``ast_node`` and ``node_type``.
        """
        line_start = getattr(node, "lineno", None)
        line_end = getattr(node, "end_lineno", line_start)
        return LPCodeUnit(
            name=name,
            unit_type=unit_type,
            file_path=path,
            parent=parent,
            line_start=line_start,
            line_end=line_end,
            code=self._source_for_node(source, node, line_start, line_end),
            language=self.language,
            extra={"ast_node": node, "node_type": type(node).__name__},
        )

    def _source_for_node(
        self,
        source: str,
        node: ast.AST,
        line_start: Optional[int],
        line_end: Optional[int],
    ) -> str:
        """Return the node's source text.

        Whole physical lines ``line_start..line_end`` are preferred; when the
        range is missing or out of bounds the node is unparsed instead, and
        an empty string is returned if that fails too.
        """
        if line_start is not None and line_end is not None:
            lines = source.splitlines()
            if 1 <= line_start <= line_end <= len(lines):
                return "\n".join(lines[line_start - 1:line_end])
        try:
            return ast.unparse(node).strip()
        except Exception:
            # Best-effort fallback: a unit without code is still useful.
            return ""

    def _dependencies_from_import(self, path: str, node: ast.Import | ast.ImportFrom) -> list[LPDependency]:
        """Create one unresolved ``imports`` dependency per imported name."""
        dependencies: list[LPDependency] = []
        if isinstance(node, ast.Import):
            for alias in node.names:
                dependencies.append(
                    LPDependency(
                        src=path,
                        dst=alias.name,
                        relation="imports",
                        symbol=alias.asname or alias.name,
                        line=getattr(node, "lineno", None),
                        confidence="unresolved",
                        extra={"module": alias.name, "alias": alias.asname},
                    )
                )
        else:
            # Relative imports keep their leading dots in the module name.
            module = "." * node.level + (node.module or "")
            for alias in node.names:
                dependencies.append(
                    LPDependency(
                        src=path,
                        dst=module or None,
                        relation="imports",
                        symbol=alias.asname or alias.name,
                        line=getattr(node, "lineno", None),
                        confidence="unresolved",
                        extra={"module": module, "imported": alias.name, "alias": alias.asname},
                    )
                )
        return dependencies

    def _extract_assignment_name(self, node: ast.Assign | ast.AnnAssign) -> str | None:
        """Return the target name of a simple assignment, else ``None``.

        Only a plain ``ast.Name`` target is recognised; for ``ast.Assign``
        only the first target is inspected.
        """
        if isinstance(node, ast.Assign):
            if node.targets and isinstance(node.targets[0], ast.Name):
                return node.targets[0].id
        elif isinstance(node, ast.AnnAssign):
            if isinstance(node.target, ast.Name):
                return node.target.id
        return None
PYTHON_CONFIG.name: PYTHON_CONFIG, + GO_CONFIG.name: GO_CONFIG, + TYPESCRIPT_CONFIG.name: TYPESCRIPT_CONFIG, + JAVASCRIPT_CONFIG.name: JAVASCRIPT_CONFIG, + C_CONFIG.name: C_CONFIG, + CPP_CONFIG.name: CPP_CONFIG, + RUST_CONFIG.name: RUST_CONFIG, +} +_PARSERS: dict[str, BaseLanguageParser] = {} + + +def _normalize_path(path: str) -> str: + raw = str(path).replace("\\", "/") + # Preserve a Windows drive prefix (``C:/``) while still dropping an + # RPG/dep-graph symbol suffix (``path:Symbol``). + drive = "" + if len(raw) >= 2 and raw[1] == ":" and raw[0].isalpha(): + drive, raw = raw[:2], raw[2:] + file_part = raw.split(":", 1)[0] + return PurePosixPath(drive + file_part).as_posix().removeprefix("./") + + +def detect_language(path: str) -> str | None: + normalized = _normalize_path(path).lower() + for config in _CONFIGS.values(): + if any(normalized.endswith(extension) for extension in config.extensions): + return config.name + return None + + +def is_supported_source(path: str) -> bool: + return detect_language(path) in _CONFIGS + + +def is_test_file(path: str) -> bool: + config = get_config_for_path(path) + if config is None: + return False + normalized = _normalize_path(path).lower() + return any(fnmatch.fnmatchcase(normalized, pattern.lower()) for pattern in config.test_globs) + + +def get_config(language: str) -> LanguageConfig: + key = language.lower() + try: + return _CONFIGS[key] + except KeyError as exc: + raise NotSupported(f"Unsupported language: {language}") from exc + + +def get_config_for_path(path: str) -> LanguageConfig | None: + language = detect_language(path) + if language is None: + return None + return get_config(language) + + +def get_parser(language: str) -> BaseLanguageParser: + key = language.lower() + if key not in _CONFIGS: + raise NotSupported(f"Unsupported language: {language}") + if key not in _PARSERS: + if key == "python": + from .python_parser import PythonParser + + _PARSERS[key] = PythonParser() + elif key == "go": + from .go_parser 
def dominant_language(paths) -> str | None:
    """Pick the language detected for the largest number of ``paths``.

    Each path is passed to :func:`detect_language`; paths with no detectable
    language are ignored rather than counted. ``None`` is returned only when
    no path yields a language (including empty or ``None`` input).

    When several languages share the top count, the one encountered first
    wins. Callers that need precise behaviour in mixed-language repos should
    pass a curated ``language_map`` to the consumer instead.
    """
    tally: dict[str, int] = {}
    for candidate in paths or ():
        language = detect_language(candidate)
        if not language:
            continue
        tally[language] = tally.get(language, 0) + 1

    if not tally:
        return None
    # ``max`` keeps the first maximal key, i.e. the earliest-seen language.
    return max(tally, key=tally.__getitem__)
    def parse_file(self, path: str, source: str) -> LPFileResult:
        """Extract units and dependencies from one Rust source file.

        The file is scanned top-down over the cleaned lines. Each recognised
        item (``mod``/``use`` imports, struct, enum, trait, impl, fn) advances
        ``index`` past its own extent, so nested content is only visited by
        the dedicated trait/impl helpers.

        Args:
            path: File path recorded on units and dependencies.
            source: Full text of the file.

        Returns:
            An ``LPFileResult`` with the units, the import / inherits /
            invokes dependencies, and the syntax-check result.
        """
        lines = source.splitlines()
        # NOTE(review): clean_lines is indexed with the same ``index`` as
        # ``lines``; presumably _clean_source_lines returns one entry per
        # physical line - confirm.
        clean_lines = self._clean_source_lines(source)
        units: list[LPCodeUnit] = []
        dependencies: list[LPDependency] = []

        index = 0
        skip_next_test_item = False
        while index < len(lines):
            clean = clean_lines[index].strip()
            if not clean:
                index += 1
                continue

            # ``#[cfg(test)]`` marks the next non-blank item for skipping.
            if _CFG_TEST_RE.match(clean):
                skip_next_test_item = True
                index += 1
                continue

            if skip_next_test_item:
                skip_next_test_item = False
                # Skip the whole block when the item opens one on this line,
                # otherwise skip to the end of the statement.
                if "{" in clean:
                    index = block_end_for_braces(clean_lines, index) + 1
                else:
                    index = line_end_for_statement(clean_lines, index) + 1
                continue

            # ``mod name;`` is recorded as an import of that module.
            mod_match = _MOD_DECL_RE.match(clean)
            if mod_match is not None:
                self._append_import(
                    units,
                    dependencies,
                    path,
                    lines,
                    mod_match.group("name"),
                    index,
                    index,
                    "rust_mod_decl",
                    "mod_item",
                )
                index += 1
                continue

            if _USE_START_RE.match(clean):
                # A ``use`` may span several lines; gather it up to the ";".
                statement, end_index = self._collect_statement(clean_lines, index)
                use_match = _USE_RE.match(statement)
                if use_match is not None:
                    use_paths = self._expand_use_paths(use_match.group("body"))
                    # import_index is only attached when one statement
                    # expands to several paths.
                    multiple = len(use_paths) > 1
                    for import_index, use_path in enumerate(use_paths, start=1):
                        self._append_import(
                            units,
                            dependencies,
                            path,
                            lines,
                            use_path,
                            index,
                            end_index,
                            "rust_use",
                            "use_declaration",
                            import_index=import_index if multiple else None,
                        )
                index = end_index + 1
                continue

            # Inline ``mod name { ... }`` blocks are skipped wholesale; no
            # units are emitted for their contents.
            inline_mod_match = _MOD_INLINE_RE.match(clean)
            if inline_mod_match is not None:
                index = block_end_for_braces(clean_lines, index) + 1
                continue

            for unit_type, node_type, pattern in (
                ("struct", "struct_item", _STRUCT_RE),
                ("enum", "enum_item", _ENUM_RE),
            ):
                match = pattern.match(clean)
                if match is None:
                    continue
                # Braced body vs. a declaration without "{" on this line.
                end = block_end_for_braces(clean_lines, index) if "{" in clean else line_end_for_statement(clean_lines, index)
                units.append(
                    make_unit(
                        name=match.group("name"),
                        unit_type=unit_type,
                        file_path=path,
                        parent=None,
                        lines=lines,
                        line_start=index + 1,
                        line_end=end + 1,
                        language=self.language,
                        node_type=node_type,
                        extra={"kind": unit_type},
                    )
                )
                index = end + 1
                break
            else:
                # Reached only when the line is neither a struct nor an enum.
                trait_match = _TRAIT_RE.match(clean)
                if trait_match is not None:
                    end = block_end_for_braces(clean_lines, index) if "{" in clean else line_end_for_statement(clean_lines, index)
                    trait_name = trait_match.group("name")
                    units.append(
                        make_unit(
                            name=trait_name,
                            unit_type="trait",
                            file_path=path,
                            parent=None,
                            lines=lines,
                            line_start=index + 1,
                            line_end=end + 1,
                            language=self.language,
                            node_type="trait_item",
                            extra={"kind": "trait"},
                        )
                    )
                    units.extend(self._extract_trait_methods(path, lines, clean_lines, trait_name, index, end))
                    index = end + 1
                    continue

                impl_target = self._impl_target_from_line(clean)
                if impl_target is not None:
                    target_type, trait_name = impl_target
                    end = block_end_for_braces(clean_lines, index)
                    # ``impl Trait for Type`` is recorded as Type inheriting
                    # from Trait; a bare ``impl Type`` adds no dependency.
                    if trait_name:
                        dependencies.append(
                            LPDependency(
                                src=target_type,
                                dst=trait_name,
                                relation="inherits",
                                symbol=trait_name,
                                line=index + 1,
                                confidence="high",
                                extra={
                                    "language": self.language,
                                    "relation_kind": "trait_impl",
                                    "type": target_type,
                                    "trait": trait_name,
                                },
                            )
                        )
                    units.extend(self._extract_impl_methods(path, lines, clean_lines, target_type, index, end))
                    index = end + 1
                    continue

                fn_match = _FN_RE.match(clean)
                if fn_match is not None:
                    end = self._function_end(clean_lines, index)
                    units.append(
                        make_unit(
                            name=fn_match.group("name"),
                            unit_type="function",
                            file_path=path,
                            parent=None,
                            lines=lines,
                            line_start=index + 1,
                            line_end=end + 1,
                            language=self.language,
                            node_type="function_item",
                        )
                    )
                    index = end + 1
                    continue

                # Any other block opener is stepped over as a whole so its
                # body is not parsed as top-level items.
                if self._is_unparsed_block_start(clean):
                    index = block_end_for_braces(clean_lines, index) + 1
                    continue

                index += 1
                continue

        dependencies.extend(self._extract_invokes(path, lines, clean_lines, units))
        syntax_error = self._syntax_error(source)
        return LPFileResult(
            file_path=path,
            language=self.language,
            units=units,
            dependencies=dependencies,
            syntax_error=syntax_error,
        )
"import_kind": import_kind} + if import_index is not None: + extra["import_index"] = import_index + units.append( + make_unit( + name=module, + unit_type="import", + file_path=path, + parent=None, + lines=lines, + line_start=start_index + 1, + line_end=end_index + 1, + language=self.language, + node_type=node_type, + extra=extra, + ) + ) + dep = LPDependency( + src=path, + dst=module, + relation="imports", + symbol=self._symbol_from_use_path(module), + line=start_index + 1, + confidence="unresolved", + extra={"language": self.language, "import_kind": import_kind, "import_path": module}, + ) + if import_index is not None: + dep.extra["import_index"] = import_index + dependencies.append(dep) + + def _extract_trait_methods( + self, + path: str, + lines: list[str], + clean_lines: list[str], + trait_name: str, + trait_start: int, + trait_end: int, + ) -> list[LPCodeUnit]: + return self._extract_parented_functions(path, lines, clean_lines, trait_name, trait_start, trait_end, "trait_method") + + def _extract_impl_methods( + self, + path: str, + lines: list[str], + clean_lines: list[str], + target_type: str, + impl_start: int, + impl_end: int, + ) -> list[LPCodeUnit]: + return self._extract_parented_functions(path, lines, clean_lines, target_type, impl_start, impl_end, "impl_method") + + def _extract_parented_functions( + self, + path: str, + lines: list[str], + clean_lines: list[str], + parent_name: str, + block_start: int, + block_end: int, + node_type: str, + ) -> list[LPCodeUnit]: + units: list[LPCodeUnit] = [] + depth = self._brace_delta(clean_lines[block_start]) + index = block_start + 1 + while index < block_end: + clean = clean_lines[index].strip() + if depth == 1: + fn_match = _FN_RE.match(clean) + if fn_match is not None: + end = self._function_end(clean_lines, index) + units.append( + make_unit( + name=fn_match.group("name"), + unit_type="method", + file_path=path, + parent=parent_name, + lines=lines, + line_start=index + 1, + line_end=end + 1, + 
language=self.language, + node_type=node_type, + extra={"receiver_type": parent_name}, + ) + ) + index = end + 1 + depth = 1 + continue + depth += self._brace_delta(clean_lines[index]) + if depth < 0: + depth = 0 + index += 1 + return units + + def _extract_invokes( + self, + path: str, + lines: list[str], + clean_lines: list[str], + units: list[LPCodeUnit], + ) -> list[LPDependency]: + import_ranges = [ + (unit.line_start, unit.line_end) + for unit in units + if unit.unit_type == "import" and unit.line_start is not None and unit.line_end is not None + ] + unit_start_lines = { + unit.line_start + for unit in units + if unit.unit_type in {"struct", "enum", "trait", "function", "method"} and unit.line_start is not None + } + dependencies: list[LPDependency] = [] + seen: set[tuple[str, str, int, str, str | None]] = set() + + for line_number, raw_line in enumerate(clean_lines, start=1): + if any(start <= line_number <= end for start, end in import_ranges): + continue + clean = self._clean_line(raw_line) + if line_number in unit_start_lines: + clean = clean.split("{", 1)[1] if "{" in clean else "" + if not clean: + continue + source_ref = self._source_reference_for_line(path, units, line_number) + + path_spans: list[tuple[int, int]] = [] + for match in _PATH_CALL_RE.finditer(clean): + call_path = match.group("path") + path_spans.append(match.span("path")) + qualifier, symbol = call_path.rsplit("::", 1) + if qualifier.startswith(_EXTERNAL_QUALIFIER_PREFIXES): + continue + if symbol in _RUST_CALL_KEYWORDS or symbol in _RUST_BUILTINS: + continue + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + symbol, + line_number, + "path", + qualifier, + ) + + for match in _DIRECT_CALL_RE.finditer(clean): + name = match.group("name") + if name in _RUST_CALL_KEYWORDS or name in _RUST_BUILTINS: + continue + if any(start <= match.start("name") < end for start, end in path_spans): + continue + if self._is_declaration_call_context(clean, match.start("name")): + 
continue + self._append_invoke_dependency( + dependencies, + seen, + source_ref, + name, + line_number, + "direct", + None, + ) + return dependencies + + def _append_invoke_dependency( + self, + dependencies: list[LPDependency], + seen: set[tuple[str, str, int, str, str | None]], + source_ref: str, + name: str, + line_number: int, + call_kind: str, + qualifier: str | None, + ) -> None: + key = (source_ref, name, line_number, call_kind, qualifier) + if key in seen: + return + seen.add(key) + extra = {"language": self.language, "call_kind": call_kind} + if qualifier: + extra["qualifier"] = qualifier + dependencies.append( + LPDependency( + src=source_ref, + dst=qualifier or name, + relation="invokes", + symbol=name, + line=line_number, + confidence="high", + extra=extra, + ) + ) + + def _source_reference_for_line(self, path: str, units: list[LPCodeUnit], line_number: int) -> str: + candidates = [ + unit + for unit in units + if unit.unit_type in {"function", "method"} + and unit.line_start is not None + and unit.line_end is not None + and unit.line_start <= line_number <= unit.line_end + ] + if not candidates: + return path + candidates.sort(key=lambda unit: (unit.line_end or line_number) - (unit.line_start or line_number)) + unit = candidates[0] + if unit.parent and unit.name: + return f"{path}:{unit.parent}.{unit.name}" + if unit.name: + return f"{path}:{unit.name}" + return path + + def _collect_statement(self, clean_lines: list[str], start_index: int) -> tuple[str, int]: + parts: list[str] = [] + for index in range(start_index, len(clean_lines)): + parts.append(clean_lines[index].strip()) + statement = " ".join(parts) + if ";" in statement: + return statement, index + return " ".join(parts), start_index + + def _expand_use_paths(self, body: str) -> list[str]: + text = re.sub(r"\s+", "", body.strip()) + expanded = [self._normalize_use_path(path) for path in self._expand_use_expr(text)] + deduped: list[str] = [] + seen: set[str] = set() + for path in expanded: + if 
not path or path in seen: + continue + seen.add(path) + deduped.append(path) + return deduped + + def _expand_use_expr(self, expr: str) -> list[str]: + brace_index = expr.find("{") + if brace_index == -1: + return [expr] + close_index = self._matching_brace(expr, brace_index) + if close_index == -1: + return [expr] + + prefix = expr[:brace_index] + suffix = expr[close_index + 1:] + body = expr[brace_index + 1:close_index] + results: list[str] = [] + for part in self._split_top_level_commas(body): + if not part: + continue + if part == "self": + combined = prefix[:-2] if prefix.endswith("::") else prefix + else: + combined = f"{prefix}{part}" + results.extend(self._expand_use_expr(f"{combined}{suffix}")) + return results + + def _split_top_level_commas(self, text: str) -> list[str]: + parts: list[str] = [] + start = 0 + depth = 0 + for index, char in enumerate(text): + if char == "{": + depth += 1 + elif char == "}": + depth -= 1 + elif char == "," and depth == 0: + parts.append(text[start:index]) + start = index + 1 + parts.append(text[start:]) + return [part for part in parts if part] + + def _matching_brace(self, text: str, open_index: int) -> int: + depth = 0 + for index in range(open_index, len(text)): + if text[index] == "{": + depth += 1 + elif text[index] == "}": + depth -= 1 + if depth == 0: + return index + return -1 + + def _normalize_use_path(self, path: str) -> str: + normalized = _ALIAS_RE.sub("", path).strip(":") + return normalized.rstrip(",") + + def _symbol_from_use_path(self, path: str) -> str | None: + if not path: + return None + if path.endswith("::*"): + return "*" + return path.rsplit("::", 1)[-1] + + def _impl_target_from_line(self, clean_line: str) -> tuple[str, str | None] | None: + match = _IMPL_HEADER_RE.match(clean_line) + if match is None: + return None + header = match.group("header").strip() + if not header: + return None + header = self._strip_leading_generics(header) + if " for " in header: + trait_part, target_part = 
header.rsplit(" for ", 1) + target = self._type_name_from_fragment(target_part) + trait_name = self._type_name_from_fragment(trait_part) + if target and trait_name: + return target, trait_name + return None + target = self._type_name_from_fragment(header) + if target: + return target, None + return None + + def _strip_leading_generics(self, header: str) -> str: + stripped = header.strip() + if not stripped.startswith("<"): + return stripped + depth = 0 + for index, char in enumerate(stripped): + if char == "<": + depth += 1 + elif char == ">": + depth -= 1 + if depth == 0: + return stripped[index + 1:].strip() + return stripped + + def _type_name_from_fragment(self, fragment: str) -> str | None: + text = self._remove_angle_groups(fragment) + text = text.replace("&", " ").replace("'", " ") + identifiers = _TYPE_IDENT_RE.findall(text) + ignored = {"as", "const", "dyn", "for", "impl", "mut", "Self", "self", "unsafe", "where"} + candidates = [identifier for identifier in identifiers if identifier not in ignored] + if not candidates: + return None + return candidates[-1] + + def _remove_angle_groups(self, text: str) -> str: + result: list[str] = [] + depth = 0 + for char in text: + if char == "<": + depth += 1 + result.append(" ") + elif char == ">" and depth > 0: + depth -= 1 + result.append(" ") + elif depth == 0: + result.append(char) + return "".join(result) + + def _function_end(self, clean_lines: list[str], start_index: int) -> int: + max_end = min(len(clean_lines), start_index + 12) + statement_parts: list[str] = [] + for end_index in range(start_index, max_end): + statement_parts.append(clean_lines[end_index].strip()) + statement = " ".join(statement_parts) + if "{" in statement: + return block_end_for_braces(clean_lines, end_index) + if ";" in statement: + return end_index + return line_end_for_statement(clean_lines, start_index) + + def _is_declaration_call_context(self, clean_line: str, match_start: int) -> bool: + prefix = clean_line[:match_start].rstrip() + 
return re.search(r"(?:^|\b)(?:fn|struct|enum|trait|impl|mod|use|if|for|while|match)\s*$", prefix) is not None + + def _is_unparsed_block_start(self, clean_line: str) -> bool: + if "{" not in clean_line: + return False + stripped = clean_line.lstrip() + return stripped.startswith(("extern ", "macro_rules!", "const ", "static ")) + + def _syntax_error(self, source: str) -> str | None: + backend_result = self.backend.validate_syntax(source) + if backend_result is not None: + valid, error = backend_result + if not valid: + return error + return None + return delimiter_syntax_error(source) + + def _clean_source_lines(self, source: str) -> list[str]: + without_block_comments = self._strip_block_comments(source) + return [self._clean_line(line) for line in without_block_comments.splitlines()] + + def _strip_block_comments(self, source: str) -> str: + result_lines: list[str] = [] + in_block = False + for line in source.splitlines(): + index = 0 + chars: list[str] = [] + while index < len(line): + if in_block: + end = line.find("*/", index) + if end == -1: + break + index = end + 2 + in_block = False + continue + start = line.find("/*", index) + if start == -1: + chars.append(line[index:]) + break + chars.append(line[index:start]) + index = start + 2 + in_block = True + result_lines.append("".join(chars)) + return "\n".join(result_lines) + + def _clean_line(self, line: str) -> str: + return strip_string_literals(line).split("//", 1)[0] + + def _brace_delta(self, line: str) -> int: + clean = self._clean_line(line) + return clean.count("{") - clean.count("}") diff --git a/CoderMind/scripts/lang_parser/tree_sitter_backend.py b/CoderMind/scripts/lang_parser/tree_sitter_backend.py new file mode 100644 index 0000000..42e83d6 --- /dev/null +++ b/CoderMind/scripts/lang_parser/tree_sitter_backend.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import importlib +from dataclasses import dataclass +from typing import Any + + +_GRAMMAR_CANDIDATES: dict[str, tuple[tuple[str, 
class TreeSitterBackend:
    """Lazy wrapper around optional tree-sitter grammar packages.

    Nothing is imported at construction time. The grammar and parser are
    resolved on first use and cached on success; on failure the reason is
    recorded and exposed through ``load_error`` while the accessor returns
    ``None``.
    """

    def __init__(self, language_name: str | None):
        self.language_name = language_name
        self._language: Any | None = None
        self._parser: Any | None = None
        self._load_error: str | None = None

    @property
    def load_error(self) -> str | None:
        """Most recent failure message, or ``None`` if nothing has failed."""
        return self._load_error

    def is_available(self) -> bool:
        """Return ``True`` when a parser could be created for the language."""
        return self.get_parser() is not None

    def get_language(self) -> Any | None:
        """Return the cached grammar, loading it from its package if needed.

        Returns ``None`` (and sets ``load_error``) when no language name is
        configured, no candidate package is registered for it, ``tree_sitter``
        itself cannot be imported, or every candidate fails to load.
        """
        cached = self._language
        if cached is not None:
            return cached

        name = self.language_name
        if not name:
            self._load_error = "No tree-sitter language configured"
            return None

        grammar_specs = _GRAMMAR_CANDIDATES.get(name, ())
        if not grammar_specs:
            self._load_error = f"No grammar candidate configured for {self.language_name}"
            return None

        try:
            from tree_sitter import Language
        except Exception as exc:
            self._load_error = f"tree-sitter is unavailable: {exc}"
            return None

        failures: list[str] = []
        for module_name, factory_name in grammar_specs:
            try:
                grammar_module = importlib.import_module(module_name)
                handle = getattr(grammar_module, factory_name)()
                # The factory may return either a ready Language or a raw
                # handle that still has to be wrapped.
                self._language = handle if isinstance(handle, Language) else Language(handle)
                return self._language
            except Exception as exc:
                failures.append(f"{module_name}.{factory_name}: {exc}")

        self._load_error = "; ".join(failures) if failures else "No grammar candidates tried"
        return None

    def get_parser(self) -> Any | None:
        """Return the cached parser, creating and configuring it on first use."""
        existing = self._parser
        if existing is not None:
            return existing

        language = self.get_language()
        if language is None:
            return None

        try:
            from tree_sitter import Parser

            parser = Parser()
            # Use the set_language() method when the Parser has one,
            # otherwise assign the ``language`` attribute.
            if not hasattr(parser, "set_language"):
                parser.language = language
            else:
                parser.set_language(language)
            self._parser = parser
            return self._parser
        except Exception as exc:
            self._load_error = f"tree-sitter parser setup failed: {exc}"
            return None

    def parse(self, source: str) -> TreeSitterParseResult | None:
        """Parse ``source`` and return the tree plus the encoded bytes.

        Returns ``None`` when no parser is available or parsing raises; in the
        latter case the exception text is stored in ``load_error``.
        """
        parser = self.get_parser()
        if parser is None:
            return None
        encoded = source.encode("utf-8", errors="replace")
        try:
            tree = parser.parse(encoded)
            return TreeSitterParseResult(tree=tree, source_bytes=encoded)
        except Exception as exc:
            self._load_error = f"tree-sitter parse failed: {exc}"
            return None

    def validate_syntax(self, source: str) -> tuple[bool, str | None] | None:
        """Check ``source`` for syntax errors using the parsed tree.

        Returns ``None`` when the source could not be parsed at all,
        ``(False, message)`` when the root node reports an error, and
        ``(True, None)`` otherwise.
        """
        outcome = self.parse(source)
        if outcome is None:
            return None
        root = getattr(outcome.tree, "root_node", None)
        reports_error = root is not None and getattr(root, "has_error", False)
        if reports_error:
            return False, "tree-sitter reported syntax errors"
        return True, None
Treat ``warning`` as complete (matches ``decide()``): - # the artefact is present and usable, only a soft - # inconsistency was flagged. Anything else (``init`` / - # ``error``) is incomplete. - done=(type_ in ("update", "warning")), + done=(type_ == "update"), raw=result, ) ) @@ -261,16 +257,10 @@ def probe(invoker: list[str]) -> list[StageState]: def decide(states: list[StageState], force: bool) -> None: """Mark each state's ``will_run`` / ``reason`` in place. - Rule: any stage with ``type not in {"update", "warning"}`` runs. - ``warning`` means the artefact is present and usable but a soft - inconsistency was detected (e.g. tasks.json with auxiliary tasks - lacking a 1:1 interface mapping). Treating ``warning`` the same as - ``update`` here keeps re-runs idempotent: a stage that successfully - produced a warning-state artefact will not be rebuilt on the next - ``cmind script plan.py`` invocation. Once any stage runs, *all* - downstream stages run too (cascade), so derived artifacts never get - out of sync with regenerated upstream ones. ``--force`` flips every - stage to ``will_run``. + Rule: only ``type == "update"`` is complete. Any warning means the + artifact exists but violates a cross-stage contract, so the stage is + rerun and downstream artifacts are rebuilt from it. ``--force`` flips + every stage to ``will_run``. 
""" cascade = False for state in states: @@ -282,9 +272,9 @@ def decide(states: list[StageState], force: bool) -> None: state.will_run = True state.reason = "upstream rebuilt" continue - if state.type in ("update", "warning"): + if state.type == "update": state.will_run = False - state.reason = "up-to-date" if state.type == "update" else "up-to-date (warning)" + state.reason = "up-to-date" else: state.will_run = True state.reason = f"type={state.type}" @@ -433,7 +423,7 @@ def main(argv: Optional[list[str]] = None) -> int: _install_sigint_handler() invoker = _resolve_invoker() - # --- Phase 1: probe ---------------------------------------------------- + # --- Step: probe ------------------------------------------------------ states = probe(invoker) decide(states, force=args.force) @@ -444,7 +434,7 @@ def main(argv: Optional[list[str]] = None) -> int: _print_probe_summary(states) return 0 - # --- Phase 1b: prerequisite check -------------------------------------- + # --- Step: prerequisite check ----------------------------------------- # If the very first stage cannot even start (its input is missing or # invalid), abort cleanly so the user gets a helpful pointer instead # of a confusing failure from the build script itself. 
``--dry-run`` @@ -463,14 +453,14 @@ def main(argv: Optional[list[str]] = None) -> int: ) return 2 - # --- Phase 2: short-circuit when nothing to do ------------------------- + # --- Step: short-circuit when nothing to do --------------------------- runnable = [s for s in states if s.will_run] if not runnable: print("All 5 planning stages are already complete — nothing to do.") print("Use `cmind script plan.py --force` to rebuild from scratch.") return 0 - # --- Phase 3: announce plan ------------------------------------------- + # --- Step: announce plan ---------------------------------------------- print(f"Planning pipeline: {len(runnable)} of {len(states)} stages to run.") print(_format_table(states)) print() @@ -484,7 +474,7 @@ def main(argv: Optional[list[str]] = None) -> int: print("DRY-RUN ▸", " ".join(_script_argv(invoker, post))) return 0 - # --- Phase 4: execute -------------------------------------------------- + # --- Step: execute ---------------------------------------------------- started = time.monotonic() for s in states: if not s.will_run: @@ -504,21 +494,13 @@ def main(argv: Optional[list[str]] = None) -> int: # fails, otherwise the user would see a JSON dump after every # stage. # - # ``update`` -> stage is fully valid; continue. - # ``warning`` -> artefact is usable but a soft inconsistency was - # detected (e.g. tasks.json having auxiliary tasks - # without a 1:1 interface mapping). Print the - # message and continue; do not fail the pipeline. - # ``init`` / ``error`` -> artefact is missing or unusable; fail. + # ``update`` -> stage is fully valid; continue. + # Any other type means the artifact is missing, unusable, or + # violates a cross-stage contract; fail so bench cannot report a + # false PASS for partial plans. 
verify = _run_check(invoker, s.stage.check_script) verify_type = verify.get("type", "error") - if verify_type == "warning": - print( - f" warning: {verify.get('message', 'no message')}" - f" (continuing)", - file=sys.stderr, - ) - elif verify_type != "update": + if verify_type != "update": print( f" verification failed: {verify_type} — " f"{verify.get('message', 'no message')}", @@ -532,7 +514,7 @@ def main(argv: Optional[list[str]] = None) -> int: elapsed = time.monotonic() - stage_started print(f"✓ {s.stage.name:<14} done in {elapsed:.1f}s") - # --- Phase 5: post-pipeline helpers ----------------------------------- + # --- Step: post-pipeline helpers -------------------------------------- print() print("Running post-pipeline helpers ...") for post in POST_STEPS: diff --git a/CoderMind/scripts/plan_tasks.py b/CoderMind/scripts/plan_tasks.py index 94d19d9..704df41 100644 --- a/CoderMind/scripts/plan_tasks.py +++ b/CoderMind/scripts/plan_tasks.py @@ -23,6 +23,8 @@ from common.trajectory import Trajectory, load_or_create_trajectory from common import LLMClient +from common.language_meta import extract_language_metadata, metadata_with_languages +from decoder_lang import ProjectTaskContext, get_backend from rpg import uuid8 # Import centralized paths @@ -264,6 +266,61 @@ def _extract_imported_modules(file_code: str, current_file: str) -> Set[str]: return imported_modules +def _extract_imported_stems_via_parser(file_path: str, file_code: str) -> Set[str]: + """Extract imported file stems for non-Python languages via ``lang_parser``. + + Python intra-subtree ordering uses dotted-module matching + (:func:`_extract_imported_modules`). Other languages express imports very + differently (Go import paths, Rust ``use`` paths, JS/TS ``from './x.js'``, + C/C++ ``#include "x.h"``), so an ``ast.parse`` of their source just yields + nothing and the topo-sort silently degrades to the LLM's raw order. 
+ + Here we let ``lang_parser`` (which understands all supported languages) + extract the import targets, then reduce each to its basename stem so the + caller can match it against the basenames of the subtree's own files. This + is intentionally conservative: it only links files whose import target + shares a basename with a sibling file, which is the common intra-module + case and never raises across languages. + """ + if not file_code.strip(): + return set() + try: + from lang_parser import get_parser_for_file + except ImportError: + return set() + parser = get_parser_for_file(file_path) + if parser is None: + return set() + try: + result = parser.parse_file(file_path, file_code) + except Exception: + return set() + + stems: Set[str] = set() + for dep in getattr(result, "dependencies", []) or []: + if getattr(dep, "relation", None) != "imports": + continue + target = (getattr(dep, "dst", None) or getattr(dep, "symbol", None) or "") + if not isinstance(target, str) or not target.strip(): + continue + # Reduce an import target to a comparable basename stem: + # "./store.js" -> "store", "tasklite/internal/store" -> "store", + # "crate::store::Task" -> "store" (last path-ish segment), + # "store.h" -> "store". + token = target.replace("\\", "/").strip().strip('"').strip("'") + token = token.split("/")[-1] + token = token.split("::")[-1] + token = token.rsplit(".", 1)[0] if "." 
in token else token + if token: + stems.add(token) + return stems + + +def _file_basename_stem(file_path: str) -> str: + """Return the lowercase basename without extension for cross-language matching.""" + return Path(file_path).stem.lower() + + def _load_dependency_source_code(file_path: str, interface_file_code: str) -> str: """Load source code for dependency analysis, combining repo and interface inputs.""" code_parts: List[str] = [] @@ -328,6 +385,7 @@ def correct_intra_subtree_file_order( files_order: List[str], subtree_interfaces: Dict[str, Dict[str, Any]], logger: Optional[logging.Logger] = None, + language: Optional[str] = None, ) -> tuple[List[str], Dict[str, Any]]: """Correct file order using imports declared in interface skeleton code.""" logger = logger or logging.getLogger(__name__) @@ -341,10 +399,18 @@ def correct_intra_subtree_file_order( "reason": "single_file_or_empty_subtree", } + # Python matches imports by dotted module path; other languages match by + # file basename stem (Go/Rust/TS/JS/C/C++ import syntaxes differ too much + # for a single dotted-path scheme). 
+ is_python = (language or "python").lower() == "python" + module_to_file = { _file_path_to_module_name(file_path): file_path for file_path in available_files } + stem_to_file: Dict[str, str] = {} + for file_path in available_files: + stem_to_file.setdefault(_file_basename_stem(file_path), file_path) dependency_edges: Dict[str, Set[str]] = defaultdict(set) dependency_pairs: List[Dict[str, str]] = [] seen_dependency_pairs: Set[tuple[str, str, str]] = set() @@ -354,10 +420,15 @@ def correct_intra_subtree_file_order( file_path=file_path, interface_file_code=subtree_interfaces[file_path].get("file_code", ""), ) - imported_modules = _extract_imported_modules(file_code, file_path) - - for module_name in sorted(imported_modules): - dependency_file = module_to_file.get(module_name) + if is_python: + imported = sorted(_extract_imported_modules(file_code, file_path)) + resolve = module_to_file.get + else: + imported = sorted(_extract_imported_stems_via_parser(file_path, file_code)) + resolve = lambda stem: stem_to_file.get(stem) # noqa: E731 + + for module_name in imported: + dependency_file = resolve(module_name) if not dependency_file or dependency_file == file_path: continue dependency_edges[dependency_file].add(file_path) @@ -398,7 +469,7 @@ def correct_intra_subtree_file_order( "corrected_files_order": list(corrected_order), "changed": changed, "dependency_edges": dependency_pairs, - "reason": "ast_import_toposort", + "reason": "import_toposort" if is_python else "import_toposort_by_stem", } @@ -738,6 +809,11 @@ def __init__( self.repo_info = repo_info self.debug = debug self.trajectory = trajectory + self.primary_language = ( + extract_language_metadata(interfaces)[0] + or extract_language_metadata(data_flow)[0] + ) + self.backend = get_backend(self.primary_language) self.llm: Optional[LLMClient] = None self.logger = logging.getLogger(__name__) @@ -794,6 +870,7 @@ def plan(self) -> Dict[str, Any]: files_order=files_order, subtree_interfaces=subtree_interfaces, 
logger=self.logger, + language=self.primary_language, ) self.file_order_diagnostics[subtree] = order_diagnostics @@ -881,6 +958,11 @@ def plan(self) -> Dict[str, Any]: planned_tasks_serializable[subtree][file_path] = valid_tasks result = { + "meta": metadata_with_languages( + self.interfaces + if extract_language_metadata(self.interfaces)[0] + else self.data_flow + ), "planned_tasks_dict": planned_tasks_serializable, "agent_results_dict": self.agent_results_dict, "file_order_diagnostics": self.file_order_diagnostics, @@ -915,6 +997,11 @@ def plan(self) -> Dict[str, Any]: updated_subtree_order = subtree_order + ["FINAL_TASKS", "PROJECT_FILES"] result = { + "meta": metadata_with_languages( + self.interfaces + if extract_language_metadata(self.interfaces)[0] + else self.data_flow + ), "planned_tasks_dict": planned_tasks_serializable, "agent_results_dict": self.agent_results_dict, "file_order_diagnostics": self.file_order_diagnostics, @@ -1010,7 +1097,7 @@ def _add_special_tasks( "- Module A defines function but Module B never imports/calls it\n" "- Data format mismatch at module boundary\n" "- CSS class names in templates not matching stylesheet definitions\n" - "\nDo NOT create main.py — it will be created in a later task." + "\nDo NOT create the main entry point — it will be created in a later task." 
), file_path="", units_key=["cross_module_wiring"], @@ -1046,7 +1133,7 @@ def _add_special_tasks( "For ALL other project types, follow these steps:\n\n" "## Step 1: Inventory existing assets\n" "List all files related to user-facing output:\n" - "- Style modules (styles.py, *.css, theme files)\n" + "- Style, theme, or presentation files when the project type uses them\n" "- Template/page/view files\n" "- Layout/component files\n" "- Static assets directory\n" @@ -1079,7 +1166,7 @@ def _add_special_tasks( "- GUI: verify window opens without errors\n" "- CLI: verify --help output and a basic command run\n" "- Write tests that assert key structural elements\n\n" - "Do NOT create main.py — it will be created in a later task." + "Do NOT create the main entry point — it will be created in a later task." ), file_path="", units_key=["ui_polish"], @@ -1109,7 +1196,7 @@ def _add_special_tasks( "In addition, create clear usage examples (e.g., example scripts or notebooks) that demonstrate " "typical end-to-end workflows. " "Place the new test files and examples in appropriate locations in the project structure. " - "NOTE: The main entry point (main.py) will be created in the next task — " + "NOTE: The main entry point will be created in the next task — " "do NOT create it here." ), file_path="", # Special marker - let agent decide placement @@ -1194,8 +1281,8 @@ def _add_project_file_tasks( so they can reference the actual code content. 
def _backend_project_task_templates(self):
    """Return backend-owned project task templates when available.

    A ``ProjectTaskContext`` is assembled from this planner's repository
    name, repository info, hyphen-separated package slug, and the reconciled
    entry-point path, then passed to ``self.backend.project_task_templates``.
    The backend's return value is handed back unchanged; the task builders in
    this class treat ``None`` as "no backend-owned templates".
    """
    build_templates = self.backend.project_task_templates
    context = ProjectTaskContext(
        repo_name=self.repo_name,
        repo_info=self.repo_info,
        package_name=self._package_slug(separator="-"),
        entry_point_path=self._reconciled_entry_point_path(),
    )
    return build_templates(context)
C++ ``src/cli/main.cpp`` off the + canonical ``src/main.cpp``, or Go ``cmd//main.go``), so the + synthetic MAIN_ENTRY task extends it instead of generating a + SECOND entry — the dual-``main`` bug. The per-language matching + rule lives in ``backend.find_existing_entry`` (filename match by + default; Go encodes the ``cmd/*/main.go`` shape). Returns ``None`` + when the skeleton declared no entry, letting the backend use its + canonical path. + """ + return self.backend.find_existing_entry(self.interfaces) def _build_requirements_task(self) -> str: - """Build task description for requirements.txt generation.""" + """Build task description for dependency metadata generation.""" + templates = self._backend_project_task_templates() + if templates is not None: + return templates.dependencies + + if self.backend.name == "go": + module_name = self._go_module_name() + return f"""Generate or update Go module dependency files for the repository: {self.repo_name} + +**Files to create/update:** +1. `go.mod` - Go module declaration using module path `{module_name}` +2. `go.sum` - Only if external dependencies are introduced + +**Instructions:** +1. Prefer the Go standard library. Do not add third-party dependencies unless the implemented code already requires them. +2. If there are no external dependencies, create a minimal `go.mod` with a current Go version. +3. If dependencies are needed, run `go mod tidy` after adding imports. +4. Verify the module with `go test ./...`. + +**Important:** +- Do NOT create Python dependency files for a Go project. +- Keep the module compact and local to this repository. +- The fixture expects standard-library-only code unless the implementation proves otherwise. +""" + if self.backend.name == "rust": + package_name = self._package_slug(separator="-") + return f"""Generate or update Rust Cargo dependency files for the repository: {self.repo_name} + +**Files to create/update:** +1. 
`Cargo.toml` - Cargo package declaration using package name `{package_name}` +2. `Cargo.lock` - Only if dependency resolution creates it + +**Instructions:** +1. Prefer the Rust standard library for CLI parsing and file handling unless implemented code already requires a crate. +2. Include `serde` and `serde_json` when JSON serialization is implemented. +3. Use edition `2021` unless the implemented code requires another stable edition. +4. Run `cargo test` after updating dependencies. + +**Important:** +- Do NOT create Python dependency files for a Rust project. +- Keep dependency choices minimal and justified by actual imports. +""" + if self.backend.name == "typescript": + package_name = self._package_slug(separator="-") + return f"""Generate or update Node.js/TypeScript dependency files for the repository: {self.repo_name} + +**Files to create/update:** +1. `package.json` - Package metadata, scripts, and dependencies using package name `{package_name}` +2. `tsconfig.json` - TypeScript compiler configuration for Node.js +3. `package-lock.json` - Only if dependency installation creates it + +**Instructions:** +1. Prefer Node.js standard APIs for local file and CLI behavior. +2. Add TypeScript tooling and a minimal test runner only when needed by the implemented code. +3. Provide scripts for `npm start`, `npm test`, and type checking when appropriate. +4. Run `npm test` after updating dependencies. + +**Important:** +- Do NOT create Python dependency files for a TypeScript project. +- Keep dependencies minimal and aligned with actual imports. 
+""" return f"""Generate or update the dependency management files for the repository: {self.repo_name} **Files to create/update:** @@ -1281,6 +1454,95 @@ def _build_requirements_task(self) -> str: def _build_main_entry_task(self) -> str: """Build task description for main entry point generation.""" + templates = self._backend_project_task_templates() + if templates is not None: + return templates.main_entry + + if self.backend.name == "go": + command_path = self._resolve_go_command_path() + return f"""Create the Go command entry point for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Create a production-quality Go CLI entry point that lets users run the complete product through documented commands. + +**Files to create:** +1. `{command_path}` - Main package for the CLI command. + +**Critical Rules:** +- Do NOT re-implement business logic in `main.go`. Import and delegate to internal packages already defined in the project. +- Every import must reference real packages and symbols from this module. +- Use idiomatic Go error handling with explicit non-zero exits on user-facing failures. +- Keep output plain text unless the requirements explicitly ask otherwise. +- This is the ONLY `package main` / `func main()` in the repository. If `{command_path}` already exists, extend it in place — do NOT create a second command package. + +**Requirements:** +1. Use `package main` and a `main()` function. +2. Provide `--help` output and subcommands/options that expose all major CLI features. +3. Delegate to implemented internal packages for task storage and task lifecycle behavior. +4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly. +5. Verify with `go run ./{command_path.rsplit('/', 1)[0]} --help` and `go test ./...`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this Go project. 
+""" + + if self.backend.name == "rust": + return f"""Create the Rust command entry point for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Create a production-quality Cargo CLI entry point that lets users run the complete product through documented commands. + +**Files to create:** +1. `src/main.rs` - Binary entry point for the CLI. +2. `src/lib.rs` (optional) - Library module that exposes reusable task/store logic. + +**Critical Rules:** +- Do NOT re-implement business logic in `main.rs`. Delegate to modules already defined in the crate. +- Every `use` path must reference real modules and symbols. +- Use idiomatic `Result`-based error handling and explicit non-zero exits for user-facing failures. +- Keep output plain text unless the requirements explicitly ask otherwise. + +**Requirements:** +1. Provide a `main()` function in `src/main.rs`. +2. Expose all major CLI commands and options described in `docs/`. +3. Delegate storage and task lifecycle behavior to implemented modules. +4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly. +5. Verify with `cargo run -- --help` and `cargo test`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this Rust project. +""" + + if self.backend.name == "typescript": + return f"""Create the TypeScript command entry point for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Create a production-quality Node.js CLI entry point that lets users run the complete product through documented commands. + +**Files to create:** +1. `src/index.ts` - CLI entry point exported or referenced by package scripts. +2. `src/cli.ts` (optional) - Command parsing and dispatch separated from domain logic. + +**Critical Rules:** +- Do NOT re-implement business logic in `index.ts`. Import and delegate to implemented modules. 
+- Every import must reference real files and exported symbols. +- Use explicit error handling and non-zero process exits for user-facing failures. +- Keep output plain text unless the requirements explicitly ask otherwise. + +**Requirements:** +1. Expose all major CLI commands and options described in `docs/`. +2. Wire `package.json` scripts so users can run the CLI with `npm start -- --help`. +3. Delegate storage and task lifecycle behavior to implemented modules. +4. Handle invalid commands, invalid ids, missing arguments, and runtime errors clearly. +5. Verify with `npm start -- --help` and `npm test`. + +**Important:** +- Read `docs/` first and faithfully expose the requested behavior. +- Do NOT create Python package entry points for this TypeScript project. +""" + # Infer the main package name from the interfaces subtree structure package_name = self._get_package_name() @@ -1359,6 +1621,18 @@ def main(args: Optional[list] = None) -> int: - Reference ONLY actual module names, classes, and functions from the codebase - Provide meaningful default behaviors so `python main.py` does something useful - The entry point should feel like a finished product, not a scaffold +- **Make `python main.py` work from a clean checkout.** If the package lives + under `src/` (e.g. `src/{package_name}/`), a bare `python main.py` will raise + `ModuleNotFoundError` because `src/` is not on `sys.path`. You MUST make the + import resolvable by ONE of: + 1. Adding a `pyproject.toml` with `[tool.setuptools] packages` discovery + under `src` (`package-dir = {{"" = "src"}}`), so an editable/normal + install exposes the package; OR + 2. Inserting a path bridge at the very top of `main.py`, before importing + the package: + `import sys, pathlib; sys.path.insert(0, str(pathlib.Path(__file__).parent / "src"))` + Prefer (1) for installable projects; (2) is the minimal always-works bridge. + Do NOT rely on the caller exporting `PYTHONPATH`. 
- **Read the `docs/` directory first** — it contains the user's original requirements and feature specifications. Make sure the entry point faithfully exposes all requested features and does NOT deviate from the intended purpose. @@ -1418,8 +1692,177 @@ def _get_package_name(self) -> str: return self.repo_name.lower().replace("-", "_").replace(" ", "_") return "project" + def _go_module_name(self) -> str: + """Infer a compact Go module or command name from repository metadata.""" + raw = self.repo_name or "project" + candidate = raw.lower().replace(" ", "-").replace("_", "-") + candidate = _re.sub(r"[^a-z0-9-]+", "-", candidate).strip("-") + return candidate or "project" + + def _resolve_go_command_path(self) -> str: + """Return the Go entry-point path, reusing the skeleton's own if present. + + The skeleton frequently already places the program entry under + ``cmd/<name>/main.go`` (e.g. ``cmd/todo/main.go``). Generating a + second ``cmd/<name>/main.go`` from the synthetic MAIN_ENTRY + task then yields two ``func main()`` packages. To keep a single + entry source, reuse an existing ``cmd/*/main.go`` discovered in + the planned interfaces; only fall back to the backend's canonical + ``cmd/<name>/main.go`` when the skeleton declared no command + package.
+ """ + subtrees_data = self.interfaces.get("subtrees", {}) + for st_data in subtrees_data.values(): + container = st_data.get("interfaces", st_data.get("files", {})) + for fpath in container: + norm = str(fpath).replace("\\", "/") + parts = norm.split("/") + if ( + len(parts) == 3 + and parts[0] == "cmd" + and parts[2] == "main.go" + ): + return norm + return self.backend.entry_point_path(self._go_module_name()) + + def _package_slug(self, separator: str = "-") -> str: + """Infer a compact package name from repository metadata.""" + raw = self.repo_name or "project" + candidate = raw.lower().replace(" ", separator).replace("_", separator) + candidate = _re.sub(rf"[^a-z0-9{_re.escape(separator)}]+", separator, candidate) + candidate = _re.sub(rf"{_re.escape(separator)}+", separator, candidate) + return candidate.strip(separator) or "project" + def _build_readme_task(self) -> str: """Build task description for README.md generation.""" + templates = self._backend_project_task_templates() + if templates is not None: + return templates.readme + + if self.backend.name == "go": + module_name = self._go_module_name() + return f"""Update the README.md for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual Go CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Installation +- Go version prerequisite +- Clone/build instructions +- Module setup using `go mod tidy` when needed + +## 3. Usage +- How to run the CLI with `go run ./cmd/{module_name} --help` +- Common command examples with expected plain-text output +- Data file options and local persistence behavior if applicable + +## 4. Project Structure +- Brief overview of `cmd/`, `internal/`, and test files +- Key packages and their purposes + +## 5. 
Development +- How to run tests with `go test ./...` +- How to format code with `gofmt` + +**Instructions:** +1. Read the `docs/` directory for the original requirements. +2. Explore the actual Go codebase to understand what was implemented. +3. Run `go run ./cmd/{module_name} --help` if the command exists. +4. Reference actual package names, types, and functions. + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this Go project. +- Base everything on the actual implemented code, not assumptions. +- Keep the tone professional and concise. +""" + if self.backend.name == "rust": + return f"""Update the README.md for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual Rust CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Installation +- Rust/Cargo prerequisite +- Clone/build instructions +- Dependency setup with `cargo build` + +## 3. Usage +- How to run the CLI with `cargo run -- --help` +- Common command examples with expected plain-text output +- Data file options and local persistence behavior if applicable + +## 4. Project Structure +- Brief overview of `src/`, modules, and tests +- Key modules and their purposes + +## 5. Development +- How to run tests with `cargo test` +- How to format code with `cargo fmt` + +**Instructions:** +1. Read the `docs/` directory for the original requirements. +2. Explore the actual Rust codebase to understand what was implemented. +3. Run `cargo run -- --help` if the binary exists. +4. Reference actual module names, structs, traits, enums, and functions. + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this Rust project. 
+- Base everything on the actual implemented code, not assumptions. +- Keep the tone professional and concise. +""" + if self.backend.name == "typescript": + return f"""Update the README.md for the repository: {self.repo_name} +Repository purpose: {self.repo_info} + +**Goal:** Replace the placeholder README with comprehensive documentation for the actual TypeScript CLI implementation. + +**Sections to include:** + +## 1. Project Title & Description +- Clear, concise description of what the CLI does +- Key commands and capabilities + +## 2. Installation +- Node.js/npm prerequisite +- Clone/install instructions using `npm install` +- TypeScript build or runtime notes if applicable + +## 3. Usage +- How to run the CLI with `npm start -- --help` +- Common command examples with expected plain-text output +- Data file options and local persistence behavior if applicable + +## 4. Project Structure +- Brief overview of `src/`, `tests/`, and configuration files +- Key modules and their purposes + +## 5. Development +- How to run tests with `npm test` +- How to type-check or build the project + +**Instructions:** +1. Read the `docs/` directory for the original requirements. +2. Explore the actual TypeScript codebase to understand what was implemented. +3. Run `npm start -- --help` if the script exists. +4. Reference actual module names, exported types, classes, and functions. + +**Important:** +- Do NOT document Python commands, Python test runners, or Python dependency files for this TypeScript project. +- Base everything on the actual implemented code, not assumptions. +- Keep the tone professional and concise. 
+""" return f"""Update the README.md for the repository: {self.repo_name} Repository purpose: {self.repo_info} diff --git a/CoderMind/scripts/rpg/builder.py b/CoderMind/scripts/rpg/builder.py index 1b92fdc..33b747c 100644 --- a/CoderMind/scripts/rpg/builder.py +++ b/CoderMind/scripts/rpg/builder.py @@ -13,6 +13,8 @@ from typing import Dict, Any, Union from pathlib import Path +from common.language_meta import extract_language_metadata + from .models import RPG, Node, NodeMetaData, NodeType, uuid8 @@ -42,6 +44,9 @@ def create_initial_rpg(repo_data: Dict[str, Any]) -> RPG: # Set generator for repo node (created in RPG.__init__) if rpg.repo_node: rpg.repo_node.meta.generator = "build_skeleton" + target_language = extract_language_metadata(repo_data)[0] + if target_language: + rpg.repo_node.meta.language = target_language logging.info(f"Creating initial RPG for repository: {repo_name}") logging.info(f"Found {len(repo_cmpt)} components to process") diff --git a/CoderMind/scripts/rpg/code_unit.py b/CoderMind/scripts/rpg/code_unit.py index 632df60..956cff0 100644 --- a/CoderMind/scripts/rpg/code_unit.py +++ b/CoderMind/scripts/rpg/code_unit.py @@ -74,14 +74,16 @@ def lineno(self) -> Optional[int]: """Get starting line number.""" if isinstance(self.node, ast.AST): return getattr(self.node, "lineno", None) - return None + # For non-AST nodes (units produced by lang_parser for Go/TS/...), + # the line info lives in ``extra``. + return self.extra.get("line_start") @property def end_lineno(self) -> Optional[int]: """Get ending line number.""" if isinstance(self.node, ast.AST): return getattr(self.node, "end_lineno", None) - return None + return self.extra.get("line_end") @property def is_top_level(self) -> bool: @@ -345,25 +347,119 @@ def from_dict(data: Dict[str, Any]) -> "CodeUnit": class ParsedFile: - """Parses a Python file and extracts CodeUnits. - + """Parses a source file and extracts CodeUnits. + + For Python files, uses the built-in ``ast`` module. 
For other + supported source languages (Go, TypeScript/JavaScript, + C/C++, Rust, ...), delegates to ``lang_parser`` and adapts the result + into the same ``CodeUnit`` shape so downstream consumers don't need to + branch on language. + Handles syntax errors gracefully by storing the error and returning an empty units list. """ - + def __init__(self, code: str, file_path: str): self.code = code self.file_path = file_path self.error: Optional[Exception] = None - + + # Try the lang_parser path first; if it doesn't claim the file (i.e. + # the file is not a registered non-Python source, or lang_parser is + # unavailable in this environment), fall through to the original + # Python ast path so existing behaviour is preserved exactly. + lp_result = self._parse_with_language_parser() + if lp_result is not None: + file_result, tree, error = lp_result + self.tree = tree + self.error = error + self.units: List[CodeUnit] = self._code_units_from_parser_result(file_result) + return + try: self.tree = ast.parse(code) except SyntaxError as e: self.error = e logging.error(f"SyntaxError parsing {file_path}: {e}") self.tree = ast.Module(body=[], type_ignores=[]) - + self.units: List[CodeUnit] = self._extract_units() + + def _parse_with_language_parser(self): + """Attempt to parse ``self.code`` via ``lang_parser``. + + Returns ``(file_result, tree, error)`` on success, or ``None`` if + the file is not recognised by ``lang_parser`` (callers should then + fall back to the original ast path). Never raises. + """ + try: + from lang_parser import get_parser_for_file + except ImportError: + return None + + parser = get_parser_for_file(self.file_path) + if parser is None: + return None + # For Python, lang_parser delegates back to ast, but downstream code + # (notably ``_extract_units``) relies on ``self.tree`` being the raw + # ast.Module. Keep Python on the original path to avoid divergence. 
+ try: + from lang_parser import detect_language + if detect_language(self.file_path) == "python": + return None + except ImportError: + return None + + if hasattr(parser, "parse_file_with_ast"): + return parser.parse_file_with_ast(self.file_path, self.code) + + file_result = parser.parse_file(self.file_path, self.code) + error = Exception(file_result.syntax_error) if file_result.syntax_error else None + empty_tree = ast.Module(body=[], type_ignores=[]) + return file_result, empty_tree, error + + # Unit kinds that other languages use to express something the rest + # of the encoder pipeline treats as "a class": Go struct / interface, + # Rust struct / enum / trait, C/C++ struct / class. Normalising them + # to ``"class"`` here lets semantic_parsing.py's class-vs-function + # grouping (which is hard-coded to ``unit_type == "class"``) pick + # them up without having to learn each parser's per-language taxonomy. + # The original kind is preserved as ``extra["lp_kind"]`` so callers + # that care (e.g. RPG rendering) can still recover it. + _LP_CLASS_LIKE_KINDS = frozenset({ + "struct", "interface", "enum", "trait", + }) + + def _code_units_from_parser_result(self, file_result) -> List["CodeUnit"]: + """Adapt ``LPFileResult`` units into the ``CodeUnit`` shape. + + ``unit_type == "file"`` entries are skipped (no Python-side equivalent). + ``unit_type`` values listed in :attr:`_LP_CLASS_LIKE_KINDS` are + rewritten to ``"class"`` (with the original kind kept in + ``extra["lp_kind"]``) so downstream code that pivots on + ``unit_type == "class"`` picks them up. + The ast-node slot is populated from ``unit.extra['ast_node']`` when + the parser provides one; otherwise the raw source slice is used so + downstream code-snippet building still works. 
+ """ + units: List[CodeUnit] = [] + for unit in file_result.units: + if unit.unit_type == "file": + continue + node = unit.extra.get("ast_node") or unit.code + extra = dict(unit.extra) + extra.setdefault("language", unit.language) + extra.setdefault("line_start", unit.line_start) + extra.setdefault("line_end", unit.line_end) + normalised_type = unit.unit_type + if normalised_type in self._LP_CLASS_LIKE_KINDS: + extra.setdefault("lp_kind", unit.unit_type) + normalised_type = "class" + units.append(CodeUnit( + unit.name, node, normalised_type, self.file_path, + unit.parent, extra=extra, + )) + return units def _extract_units(self) -> List[CodeUnit]: """Extract all code units from the AST.""" @@ -645,6 +741,27 @@ def ensure_class_headers_for_partial_methods( result.append(cls_unit) return result + def _language_for_units(self, units: List[CodeUnit]) -> Optional[str]: + """Best-effort language for a homogeneous-by-file unit list. + + First looks at ``unit.extra['language']`` (set by lang_parser); if + absent, falls back to detecting from the first ``file_path``. Used + to skip Python-only ast helpers when the snippet is e.g. Go / TS. + """ + for unit in units: + language = unit.extra.get("language") + if language: + return language + for unit in units: + if not unit.file_path: + continue + try: + from lang_parser import detect_language + return detect_language(unit.file_path) + except ImportError: + return None + return None + def generate_code_snippet( self, source_code: str, @@ -679,14 +796,21 @@ def generate_code_snippet( if cls_unit and cls_unit.lineno: keep[cls_unit.lineno - 1] = True - # 2) Import / assignment lines + # 2) Import / assignment lines (Python only — non-Python sources + # come from lang_parser which already populates the import + # unit's line range, so the import lines are kept via step 1). 
if keep_imports or keep_assignments: - tree = ast.parse(source_code) - for node in tree.body: - if keep_imports and isinstance(node, (ast.Import, ast.ImportFrom)): - keep[node.lineno - 1] = True - if keep_assignments and isinstance(node, ast.Assign): - keep[node.lineno - 1] = True + language = self._language_for_units(units) + if language in (None, "python"): + try: + tree = ast.parse(source_code) + for node in tree.body: + if keep_imports and isinstance(node, (ast.Import, ast.ImportFrom)): + keep[node.lineno - 1] = True + if keep_assignments and isinstance(node, ast.Assign): + keep[node.lineno - 1] = True + except SyntaxError: + pass # 3) Adjacent blank lines near core lines core_idx = {i for i, k in enumerate(keep) if k} @@ -751,10 +875,15 @@ def build( keep_assignments=keep_assignments, with_lineno=with_lineno, ) + try: + from lang_parser import markdown_fence_for_path + fence = markdown_fence_for_path(file_path) + except ImportError: + fence = "python" if with_file_path: - sections.append(f"```python\n## File Path: {file_path}\n\n{body}\n```") + sections.append(f"```{fence}\n## File Path: {file_path}\n\n{body}\n```") else: - sections.append(f"```python\n## Tool Block\n\n{body}\n```") + sections.append(f"```{fence}\n## Tool Block\n\n{body}\n```") return "\n\n".join(sections) def build_file_map( diff --git a/CoderMind/scripts/rpg/dep_graph.py b/CoderMind/scripts/rpg/dep_graph.py index da2cbe4..b88b2df 100644 --- a/CoderMind/scripts/rpg/dep_graph.py +++ b/CoderMind/scripts/rpg/dep_graph.py @@ -21,17 +21,21 @@ import hashlib import logging import os +import posixpath from collections import defaultdict from pathlib import Path, PurePosixPath from typing import Any, Callable, Dict, List, Optional, Tuple import networkx as nx +import lang_parser from .models import EdgeType, NodeType from common.utils import ( normalize_path, is_test_file, get_node_range_robust, + extract_source_by_lines, + path_has_skip_dir, ) logger = logging.getLogger(__name__) @@ -64,15 +68,10 @@ 
def _exclude_irrelevant_for_build(file_id: str) -> bool: ".mp3", ".mp4", ".zip", ".tar", ".gz", ".pdf", ".docx", ".xlsx", ".pptx", ".exe", ".dll", ".so", ".o", ".a", + ".rlib", ".rmeta", ".log", } - PATH_BLACKLIST = { - ".git", "__pycache__", "node_modules", - ".venv", "venv", ".idea", ".vscode", - ".pytest_cache", ".mypy_cache", "build", "dist", - } - FILE_BLACKLIST = { "Makefile", "CMakeLists.txt", "Dockerfile", "LICENSE", "LICENSE.txt", @@ -85,7 +84,7 @@ def _exclude_irrelevant_for_build(file_id: str) -> bool: if path_obj.suffix.lower() in EXT_BLACKLIST: return False - if any(part in PATH_BLACKLIST for part in path_obj.parts): + if path_has_skip_dir(file_id): return False if path_obj.name in FILE_BLACKLIST: @@ -103,15 +102,17 @@ def _exclude_irrelevant_for_build(file_id: str) -> bool: def _exclude_irrelevant_for_parse(file_id: str) -> bool: """Default filter for ``DependencyGraph.parse()``. - Returns ``True`` if the file should be **parsed** for AST analysis. + Returns ``True`` if the file should be parsed for source analysis. + Accepts any source language registered with ``lang_parser`` (Python, + Go, TypeScript / JavaScript, C / C++, Rust, ...). 
""" - if not file_id.endswith(".py"): + if not lang_parser.is_supported_source(file_id): return False - path_lower = file_id.lower() - if is_test_file(path_lower): + if lang_parser.is_test_file(file_id): return False + path_lower = file_id.lower() EXCLUDE_FILES = { "setup.py", "__main__.py", "conftest.py", "requirements.py", @@ -120,10 +121,6 @@ def _exclude_irrelevant_for_parse(file_id: str) -> bool: if any(path_lower.endswith(f"/{f}") for f in EXCLUDE_FILES): return False - base_name = os.path.basename(file_id) - if base_name.startswith("test_") or base_name.endswith("_test.py"): - return False - return True @@ -177,6 +174,11 @@ class DependencyGraph: def __init__(self, repo_dir: str): self.repo_dir = repo_dir self.G: nx.MultiDiGraph = nx.MultiDiGraph() + # Lazy cache of the Go module path declared in go.mod; resolved on + # first cross-package import lookup so we don't pay the I/O on + # Python-only repositories. + self._go_module_path_cache: str | None = None + self._go_module_path_loaded = False # Filtered subgraph views self.G_tree = nx.subgraph_view( @@ -210,6 +212,33 @@ def __init__(self, repo_dir: str): "inherits", "inherited_by", }) + # ------------------------------------------------------------------ + # lang_parser integration constants + # ------------------------------------------------------------------ + # Unit types from lang_parser that should map to a CLASS-like RPG node. + _LP_CLASS_LIKE_UNIT_TYPES = frozenset({"class", "struct", "interface", "enum", "trait"}) + # lang_parser unit_type -> RPG NodeType. + _LP_NODE_TYPES = { + "class": NodeType.CLASS, + "struct": NodeType.CLASS, + "enum": NodeType.CLASS, + "interface": NodeType.INTERFACE, + "trait": NodeType.INTERFACE, + "function": NodeType.FUNCTION, + "method": NodeType.METHOD, + "import": NodeType.IMPORT, + "package": NodeType.PACKAGE, + } + # Extensions to try when resolving an ECMAScript import that omitted the suffix. 
+ _TS_JS_IMPORT_EXTENSIONS = (".ts", ".tsx", ".js", ".jsx") + # File extensions belonging to the C-family for include-graph heuristics. + _C_FAMILY_EXTENSIONS = frozenset({".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx"}) + # C-family header extensions. A function call resolves to its definition in + # an implementation file, not a prototype/declaration in a header. + _C_HEADER_EXTENSIONS = frozenset({".h", ".hpp", ".hh", ".hxx"}) + # File extensions belonging to Rust crates. + _RUST_EXTENSIONS = frozenset({".rs"}) + @staticmethod def _extract_signature(node: ast.AST, is_method: bool = False) -> str: """Extract a human-readable signature string from a FunctionDef AST node. @@ -422,22 +451,49 @@ def build(self, filter_func: Callable[[str], bool] = _exclude_irrelevant_for_bui # ------------------------------------------------------------------ def parse(self, filter_func: Callable[[str], bool] = _exclude_irrelevant_for_parse) -> None: - """Parse Python files to extract code structure and dependency edges.""" - # 1) Parse files for class/function/method definitions + """Parse supported source files to extract structure and dependency edges. + + Python files keep the original ``ast``-based path (unchanged). Other + supported languages (Go, TypeScript / JavaScript, C / C++, Rust, ...) + are routed through ``lang_parser`` and adapted into the same RPG + node / edge schema via the ``_parse_lp_*`` helpers. + """ + # 1) Parse files for code units. Python keeps the exact AST helper path. 
logger.info("Parsing DependencyGraph to extract code structure") + lp_results: list[tuple[str, lang_parser.LPFileResult]] = [] for file_id, attrs in list(self.G.nodes(data=True)): if attrs.get("type") != NodeType.FILE or not filter_func(file_id): continue content = self._read_code(file_id) + language = lang_parser.detect_language(file_id) + if language == "python": + try: + tree = ast.parse(content) + except SyntaxError as e: + logger.debug("[parse:skip] %s: %s", file_id, e) + continue + + self.G.nodes[file_id]["ast"] = tree + self.G.nodes[file_id]["language"] = "python" + self._parse_file(file_id, tree, content) + continue + + if language is None: + continue + try: - tree = ast.parse(content) - except SyntaxError as e: + result = lang_parser.parse_file(file_id, content) + except lang_parser.NotSupported as e: logger.debug("[parse:skip] %s: %s", file_id, e) continue + self._parse_lp_file_result(file_id, result) + lp_results.append((file_id, result)) - self.G.nodes[file_id]["ast"] = tree - self._parse_file(file_id, tree, content) + # Second pass: invoke edges for lang_parser results need the full unit + # registry from the first pass so cross-file calls can be resolved. + for file_id, result in lp_results: + self._parse_lp_invoke_dependencies(file_id, result) logger.info( "Finished parsing code structure, now has %d nodes and %d edges", self.G.number_of_nodes(), @@ -607,16 +663,44 @@ def add_file( content_hash=_hash_content(content), ) - # Parse AST + code units. SyntaxError is non-fatal — we keep the - # file node (with the hash) so a later fix-up commit will trigger - # a real re-parse. + # Dispatch on language exactly like :meth:`parse` does: Python + # files use ast + ``_parse_file``; other supported languages + # route through lang_parser + ``_parse_lp_file_result``. 
Without + # this, an incremental ``update_files`` call (which goes through + # ``add_file``) silently dropped all units / imports / invokes + # for non-Python files because ``ast.parse`` raised SyntaxError + # and we returned early. + try: + language = lang_parser.detect_language(nid) + except Exception: # pragma: no cover - defensive + language = None + + if language == "python" or language is None: + try: + tree = ast.parse(content) + except SyntaxError as exc: + logger.debug("[add_file:syntax] %s: %s", nid, exc) + return True + self.G.nodes[nid]["ast"] = tree + self.G.nodes[nid]["language"] = "python" + self._parse_file(nid, tree, content) + return True + try: - tree = ast.parse(content) - except SyntaxError as exc: - logger.debug("[add_file:syntax] %s: %s", nid, exc) + result = lang_parser.parse_file(nid, content) + except lang_parser.NotSupported as exc: + logger.debug("[add_file:lp_unsupported] %s: %s", nid, exc) return True - self.G.nodes[nid]["ast"] = tree - self._parse_file(nid, tree, content) + except Exception as exc: # pragma: no cover - defensive + logger.debug("[add_file:lp_error] %s: %s", nid, exc) + return True + self._parse_lp_file_result(nid, result) + # NOTE: ``_parse_lp_invoke_dependencies`` is deliberately NOT + # called here. Cross-file invoke resolution needs the global + # unit registry to be in its final state, which is only + # guaranteed after ``_rerun_semantic_passes`` runs at the end + # of ``update_files``. That helper will re-discover this file's + # language attr and replay both the file-result + invoke pass. return True def _wipe_semantic_edges(self) -> int: @@ -655,12 +739,22 @@ def _rerun_semantic_passes(self) -> None: lockstep with full rebuild. Callers must ensure ``ast`` attrs are fresh on any file they've edited (the public entry points :meth:`add_file` and :meth:`update_files` do this). + + Non-Python files (Go / TS / C / Rust / ...) 
don't carry an + ``ast`` attr (lang_parser produces an :class:`LPFileResult`, + not a :class:`ast.Module`), and their import + invoke edges are + produced by ``_parse_lp_file_result`` / ``_parse_lp_invoke_ + dependencies``. Re-run those passes too — using the cached + ``code`` attr — otherwise an incremental ``update_files`` call + wipes the semantic edges and never restores them for non-Python + sources. """ # Pass 2 prereq: alias maps must exist on every code node before # any _parse_imports call (it propagates aliases bidirectionally). for nid in list(self.G_code.nodes): self._init_alias_map(nid) + # ----- Python ast-based passes (unchanged behaviour) ----- # Pass 2: imports alias_links: nx.DiGraph = nx.DiGraph() for nid, attrs in list(self.G_code.nodes(data=True)): @@ -682,6 +776,35 @@ def _rerun_semantic_passes(self) -> None: if tree is not None: self._parse_invokes(nid, tree) + # ----- lang_parser passes for non-Python file nodes ----- + # Walk file nodes that carry a non-Python language and re-parse + # them via lang_parser. ``_parse_lp_file_result`` rebuilds + # IMPORTS edges (and keeps CONTAINS structure idempotently), + # and ``_parse_lp_invoke_dependencies`` rebuilds INVOKES edges + # using the unit registry the first pass populates. Two-pass + # ordering mirrors :meth:`parse`. 
+ lp_results: List[Tuple[str, "lang_parser.LPFileResult"]] = [] + for nid, attrs in list(self.G.nodes(data=True)): + if attrs.get("type") != NodeType.FILE: + continue + language = attrs.get("language") + if not language or language == "python": + continue + content = attrs.get("code") + if not content: + continue + try: + result = lang_parser.parse_file(nid, content) + except lang_parser.NotSupported: + continue + except Exception as exc: # pragma: no cover - defensive + logger.debug("[rerun_semantic_passes:lp_skip] %s: %s", nid, exc) + continue + self._parse_lp_file_result(nid, result) + lp_results.append((nid, result)) + for nid, result in lp_results: + self._parse_lp_invoke_dependencies(nid, result) + def update_files( self, file_rels: List[str], @@ -879,6 +1002,7 @@ def _extract_from_control_flow( ast=node, start_line=start, end_line=end, + code=extract_source_by_lines(source_code, start, end), signature=self._extract_signature(node, is_method=False), ) elif isinstance(node, ast.ClassDef): @@ -894,6 +1018,7 @@ def _extract_from_control_flow( ast=node, start_line=start, end_line=end, + code=extract_source_by_lines(source_code, start, end), ) # Methods inside the class for body_node in node.body: @@ -909,6 +1034,7 @@ def _extract_from_control_flow( ast=body_node, start_line=s2, end_line=e2, + code=extract_source_by_lines(source_code, s2, e2), signature=self._extract_signature(body_node, is_method=True), ) # Recurse into nested control flow @@ -946,6 +1072,7 @@ def get_range(node: ast.AST) -> Tuple[int, int]: ast=node, start_line=start, end_line=end, + code=extract_source_by_lines(source_code, start, end), signature=self._extract_signature(node, is_method=False), ) @@ -962,6 +1089,7 @@ def get_range(node: ast.AST) -> Tuple[int, int]: ast=node, start_line=start, end_line=end, + code=extract_source_by_lines(source_code, start, end), ) for body in node.body: @@ -978,6 +1106,7 @@ def get_range(node: ast.AST) -> Tuple[int, int]: ast=body, start_line=start2, 
end_line=end2, + code=extract_source_by_lines(source_code, start2, end2), signature=self._extract_signature(body, is_method=True), ) @@ -1728,22 +1857,43 @@ def reparse_ast(self, filter_func: Callable[[str], bool] = _exclude_irrelevant_f """Reparse source code to restore AST and code structure. Must be called after :meth:`from_dict` to reconstruct AST objects and - semantic edges that are not serialized. + semantic edges that are not serialized. Mirrors the language + dispatch in :meth:`parse`: Python files keep the original ast path, + non-Python files re-run through ``lang_parser``. """ + lp_results: list[tuple[str, lang_parser.LPFileResult]] = [] for nid, attrs in list(self.G.nodes(data=True)): if attrs.get("type") != NodeType.FILE or not filter_func(nid): continue content = self._read_code(nid) - try: - tree = ast.parse(content) - except SyntaxError: + language = lang_parser.detect_language(nid) + if language == "python": + try: + tree = ast.parse(content) + except SyntaxError: + continue + + self.G.nodes[nid]["ast"] = tree + self.G.nodes[nid]["language"] = "python" + + # Rebuild functions / classes nodes + self._parse_file(nid, tree, content) continue - self.G.nodes[nid]["ast"] = tree + if language is None: + continue - # Rebuild functions / classes nodes - self._parse_file(nid, tree, content) + try: + result = lang_parser.parse_file(nid, content) + except lang_parser.NotSupported: + continue + self._parse_lp_file_result(nid, result) + lp_results.append((nid, result)) + + # Second pass for lang_parser invokes (needs full unit registry). 
+ for nid, result in lp_results: + self._parse_lp_invoke_dependencies(nid, result) # Re-run import / invoke / inherit pass alias_links: nx.DiGraph = nx.DiGraph() @@ -1766,3 +1916,866 @@ def reparse_ast(self, filter_func: Callable[[str], bool] = _exclude_irrelevant_f self._parse_invokes(nid, node_ast) logger.info("AST re-parsed & semantic edges reconstructed") + + # ------------------------------------------------------------------ + # LPFileResult parsing for non-Python languages + # ------------------------------------------------------------------ + + def _parse_lp_file_result( + self, + file_id: str, + result: lang_parser.LPFileResult, + *, + include_dependencies: bool = True, + ) -> None: + """Add graph nodes and dependency edges from a language-parser result.""" + file_attrs = self.G.nodes[file_id] + file_attrs["language"] = result.language + file_attrs["unit_type"] = "file" + if result.syntax_error is not None: + file_attrs["syntax_error"] = result.syntax_error + else: + file_attrs.pop("syntax_error", None) + + file_code_path = file_attrs.get("code_path", "") + unit_node_ids: dict[int, str] = {} + class_like_by_name: dict[str, str] = {} + import_nodes_by_line: dict[int, str] = {} + import_nodes_by_module: dict[str, str] = {} + + for index, unit in enumerate(result.units): + unit_id = self._lp_unit_node_id(file_id, unit, index) + unit_node_ids[index] = unit_id + if unit.unit_type in self._LP_CLASS_LIKE_UNIT_TYPES and unit.name: + class_like_by_name[unit.name] = unit_id + + for index, unit in enumerate(result.units): + unit_id = unit_node_ids[index] + parent_id = file_id + if unit.parent: + parent_id = class_like_by_name.get(unit.parent, file_id) + + self._add_node( + unit_id, + type=self._lp_node_type(unit.unit_type), + name=unit.name or self._lp_fallback_unit_name(unit, index), + parent_id=parent_id, + **self._lp_unit_node_attrs(unit, file_code_path), + ) + + if unit.unit_type == "import": + if unit.line_start is not None: + 
import_nodes_by_line.setdefault(unit.line_start, unit_id) + for module in (unit.name, unit.extra.get("module"), unit.extra.get("import_path")): + if module: + import_nodes_by_module[module] = unit_id + + if include_dependencies: + self._parse_lp_dependencies(file_id, result, import_nodes_by_line, import_nodes_by_module) + + def _lp_node_type(self, unit_type: str) -> NodeType: + """Map a language-parser unit type onto an existing graph node type.""" + return self._LP_NODE_TYPES.get(unit_type, NodeType.MODULE) + + def _lp_unit_node_id(self, file_id: str, unit: lang_parser.LPCodeUnit, index: int) -> str: + """Build a deterministic graph node ID for a language-parser code unit.""" + line = unit.line_start if unit.line_start is not None else index + 1 + if unit.unit_type == "import": + import_index = (unit.extra or {}).get("import_index") + if import_index is not None: + return normalize_path(f"{file_id}:import:{line}:{import_index}") + return normalize_path(f"{file_id}:import:{line}") + if unit.unit_type == "package": + return normalize_path(f"{file_id}:package:{line}") + if unit.parent and unit.name: + return normalize_path(f"{file_id}:{unit.parent}.{unit.name}") + if unit.name: + return normalize_path(f"{file_id}:{unit.name}") + return normalize_path(f"{file_id}:{unit.unit_type}:{line}") + + def _lp_fallback_unit_name(self, unit: lang_parser.LPCodeUnit, index: int) -> str: + line = unit.line_start if unit.line_start is not None else index + 1 + return f"{unit.unit_type}:{line}" + + def _lp_unit_node_attrs(self, unit: lang_parser.LPCodeUnit, file_code_path: str) -> dict[str, Any]: + extra = dict(unit.extra or {}) + attrs: dict[str, Any] = { + "language": unit.language, + "unit_type": unit.unit_type, + "start_line": unit.line_start, + "end_line": unit.line_end, + "code": unit.code, + "code_path": file_code_path, + "extra": extra, + } + for key, value in extra.items(): + attr_key = "import_module" if key == "module" else key + if attr_key in {"module", "name", "type"}: 
+ attr_key = f"unit_{attr_key}" + if attr_key not in attrs: + attrs[attr_key] = value + return attrs + + def _parse_lp_dependencies( + self, + file_id: str, + result: lang_parser.LPFileResult, + import_nodes_by_line: dict[int, str], + import_nodes_by_module: dict[str, str], + ) -> None: + for dependency_index, dep in enumerate(result.dependencies): + if dep.relation == "imports": + self._add_lp_import_edge( + file_id, + dep, + dependency_index, + import_nodes_by_line, + import_nodes_by_module, + ) + elif dep.relation == "contains": + self._add_lp_contains_edge(file_id, dep) + elif dep.relation == "invokes": + self._add_lp_invoke_edge(file_id, dep) + elif dep.relation == "inherits": + self._add_lp_inherit_edge(file_id, dep) + + def _parse_lp_invoke_dependencies(self, file_id: str, result: lang_parser.LPFileResult) -> None: + for dep in result.dependencies: + if dep.relation == "invokes": + self._add_lp_invoke_edge(file_id, dep) + + def _add_lp_import_edge( + self, + file_id: str, + dep: lang_parser.LPDependency, + dependency_index: int, + import_nodes_by_line: dict[int, str], + import_nodes_by_module: dict[str, str], + ) -> None: + src_id = self._resolve_lp_reference(dep.src, file_id) or file_id + target_id = self._resolve_lp_import_destination(file_id, dep.dst, dep) + resolved = target_id is not None + if target_id is None: + target_id = self._ensure_lp_import_placeholder( + file_id, + dep, + dependency_index, + import_nodes_by_line, + import_nodes_by_module, + ) + else: + self._mark_lp_import_unit(dep, import_nodes_by_line, import_nodes_by_module, True, target_id) + + self._add_edge( + src_id, + target_id, + type=EdgeType.IMPORTS, + **self._lp_dependency_edge_attrs(dep, resolved), + ) + + def _add_lp_contains_edge(self, file_id: str, dep: lang_parser.LPDependency) -> None: + src_id = self._resolve_lp_reference(dep.src, file_id) + dst_id = self._resolve_lp_reference(dep.dst, file_id) + if src_id and dst_id: + self._add_edge(src_id, dst_id, type=EdgeType.CONTAINS, 
**self._lp_dependency_edge_attrs(dep, True)) + + def _add_lp_inherit_edge(self, file_id: str, dep: lang_parser.LPDependency) -> None: + src_id = self._resolve_lp_reference(dep.src, file_id) + dst_id = self._resolve_lp_reference(dep.dst, file_id) + if src_id and dst_id and src_id != dst_id: + self._add_edge(src_id, dst_id, type=EdgeType.INHERITS, **self._lp_dependency_edge_attrs(dep, True)) + + def _add_lp_invoke_edge(self, file_id: str, dep: lang_parser.LPDependency) -> None: + src_id = self._resolve_lp_reference(dep.src, file_id) or file_id + dst_id = self._resolve_lp_invoke_destination(file_id, dep) + if dst_id is None or dst_id == src_id: + return + self._add_edge( + src_id, + dst_id, + type=EdgeType.INVOKES, + **self._lp_dependency_edge_attrs(dep, True), + ) + + def _lp_dependency_edge_attrs(self, dep: lang_parser.LPDependency, resolved: bool) -> dict[str, Any]: + confidence = "resolved" if resolved else (dep.confidence or "unresolved") + attrs: dict[str, Any] = { + "relation": dep.relation, + "symbol": dep.symbol, + "line": dep.line, + "confidence": confidence, + "resolved": resolved, + "extra": dict(dep.extra or {}), + } + if dep.dst is not None: + attrs["import_module"] = dep.dst + if not resolved: + attrs["heuristic"] = True + for key, value in (dep.extra or {}).items(): + attr_key = "unit_type" if key == "type" else key + if attr_key not in attrs and attr_key != "type": + attrs[attr_key] = value + return attrs + + def _resolve_lp_import_destination( + self, + file_id: str, + module: str | None, + dep: lang_parser.LPDependency | None = None, + ) -> Optional[str]: + if not module: + return None + + if self._is_c_family_import(file_id, dep): + if (dep.extra or {}).get("include_style") != "quote": + return None + return self._resolve_c_local_include(file_id, module) + + if self._is_rust_import(dep): + return self._resolve_rust_import(file_id, module, dep) + + direct_id = normalize_path(module) + if direct_id in self.G and self.G.nodes[direct_id].get("type") == 
NodeType.FILE: + return direct_id + + if module.startswith("."): + return self._resolve_relative_ts_js_import(file_id, module) + + go_target = self._resolve_go_module_import(module) + if go_target is not None: + return go_target + + return None + + def _resolve_relative_ts_js_import(self, file_id: str, module: str) -> Optional[str]: + base_dir = PurePosixPath(file_id).parent.as_posix() + if base_dir == ".": + base_dir = "" + raw_path = posixpath.normpath(posixpath.join(base_dir, module)).removeprefix("./") + extension = posixpath.splitext(raw_path)[1] + + candidates: list[str] = [] + if extension: + candidates.append(raw_path) + else: + candidates.extend(f"{raw_path}{ext}" for ext in self._TS_JS_IMPORT_EXTENSIONS) + candidates.extend(posixpath.join(raw_path, f"index{ext}") for ext in self._TS_JS_IMPORT_EXTENSIONS) + + for candidate in candidates: + candidate_id = normalize_path(candidate) + if candidate_id in self.G and self.G.nodes[candidate_id].get("type") == NodeType.FILE: + return candidate_id + return None + + def _is_c_family_import(self, file_id: str, dep: lang_parser.LPDependency | None) -> bool: + if dep is None: + return False + extra = dep.extra or {} + language = extra.get("language") + import_kind = extra.get("import_kind") + if language in {"c", "cpp"} or import_kind in {"c_include", "cpp_include"}: + return True + return PurePosixPath(file_id).suffix.lower() in self._C_FAMILY_EXTENSIONS and extra.get("include_style") + + def _resolve_c_local_include(self, file_id: str, include_path: str) -> Optional[str]: + base_dir = PurePosixPath(file_id).parent.as_posix() + if base_dir == ".": + base_dir = "" + + candidates = [] + joined = posixpath.join(base_dir, include_path) if base_dir else include_path + candidates.append(posixpath.normpath(joined).removeprefix("./")) + candidates.append(posixpath.normpath(include_path).removeprefix("./")) + + for candidate in candidates: + candidate_id = normalize_path(candidate) + if candidate_id in self.G and 
self.G.nodes[candidate_id].get("type") == NodeType.FILE: + return candidate_id + + suffix = normalize_path(posixpath.normpath(include_path).removeprefix("./")) + matches = [] + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") != NodeType.FILE: + continue + if PurePosixPath(node_id).suffix.lower() not in self._C_FAMILY_EXTENSIONS: + continue + if node_id == suffix or node_id.endswith(f"/{suffix}"): + matches.append(node_id) + matches.sort() + if len(matches) == 1: + return matches[0] + return None + + def _resolve_lp_invoke_destination(self, file_id: str, dep: lang_parser.LPDependency) -> Optional[str]: + language = (dep.extra or {}).get("language") + if language in {"typescript", "javascript"}: + return self._resolve_ecmascript_invoke(file_id, dep) + if language == "go": + return self._resolve_go_invoke(file_id, dep) + if language in {"c", "cpp"}: + return self._resolve_c_invoke(file_id, dep) + if language == "rust": + return self._resolve_rust_invoke(file_id, dep) + return self._resolve_lp_reference(dep.dst or dep.symbol, file_id) + + def _resolve_ecmascript_invoke(self, file_id: str, dep: lang_parser.LPDependency) -> Optional[str]: + extra = dep.extra or {} + module = extra.get("module") + symbol = extra.get("imported") or dep.symbol or dep.dst + if module: + target_file = self._resolve_lp_import_destination(file_id, module) + if target_file is None: + return None + named_target = self._find_named_unit_in_file(target_file, symbol) + if named_target is not None: + return named_target + if extra.get("kind") == "default": + return self._find_default_ecmascript_export_in_file(target_file) + return None + return self._find_named_unit_in_file(file_id, symbol) + + def _resolve_go_invoke(self, file_id: str, dep: lang_parser.LPDependency) -> Optional[str]: + extra = dep.extra or {} + symbol = dep.symbol or dep.dst + if not symbol: + return None + module = extra.get("module") + if module: + package_file = self._resolve_lp_import_destination(file_id, 
module) + if package_file is None: + return None + return self._find_go_package_symbol(package_file, symbol) + return self._find_go_package_symbol(file_id, symbol) + + def _resolve_c_invoke(self, file_id: str, dep: lang_parser.LPDependency) -> Optional[str]: + symbol = dep.symbol or dep.dst + if not symbol: + return None + + extra = dep.extra or {} + call_kind = extra.get("call_kind") + qualifier = extra.get("qualifier") + if call_kind == "static" and qualifier: + return self._find_c_static_method(file_id, qualifier, symbol) + if call_kind == "constructor": + same_file_class = self._find_c_symbol_in_file(file_id, symbol, {NodeType.CLASS}) + if same_file_class is not None: + return same_file_class + return self._find_c_directory_symbol(file_id, symbol, {NodeType.CLASS}) + + same_file = self._find_c_symbol_in_file(file_id, symbol, {NodeType.FUNCTION, NodeType.METHOD}) + if same_file is not None: + return same_file + return self._find_c_directory_symbol(file_id, symbol, {NodeType.FUNCTION, NodeType.METHOD}) + + def _is_rust_import(self, dep: lang_parser.LPDependency | None) -> bool: + if dep is None: + return False + extra = dep.extra or {} + import_kind = extra.get("import_kind") or "" + return extra.get("language") == "rust" or import_kind.startswith("rust_") + + def _resolve_rust_import( + self, + file_id: str, + module: str | None, + dep: lang_parser.LPDependency | None = None, + ) -> Optional[str]: + if not module: + return None + extra = dep.extra or {} if dep is not None else {} + import_kind = extra.get("import_kind") + if import_kind == "rust_mod_decl": + return self._resolve_rust_mod_decl(file_id, module) + return self._resolve_rust_path_to_file(file_id, module) + + def _resolve_rust_mod_decl(self, file_id: str, mod_name: str) -> Optional[str]: + file_dir = self._node_parent_dir(file_id) + candidates = [ + posixpath.join(file_dir, f"{mod_name}.rs") if file_dir else f"{mod_name}.rs", + posixpath.join(file_dir, mod_name, "mod.rs") if file_dir else 
posixpath.join(mod_name, "mod.rs"), + ] + return self._first_existing_rust_file(candidates) + + def _resolve_rust_path_to_file(self, file_id: str, module_path: str | None) -> Optional[str]: + if not module_path: + return None + if module_path.startswith(("std::", "core::", "alloc::")): + return None + if module_path.startswith("crate::"): + return self._resolve_rust_crate_path(file_id, module_path[len("crate::"):]) + if module_path.startswith("self::"): + return self._resolve_rust_self_path(file_id, module_path[len("self::"):]) + if module_path.startswith("super"): + return self._resolve_rust_super_path(file_id, module_path) + + crate_target = self._resolve_rust_crate_path(file_id, module_path) + if crate_target is not None: + return crate_target + return self._resolve_rust_self_path(file_id, module_path) + + def _resolve_rust_crate_path(self, file_id: str, path_after_crate: str) -> Optional[str]: + src_root = self._find_rust_crate_src_root(file_id) + if src_root is None: + return None + return self._resolve_rust_module_path_from_bases(path_after_crate, [src_root]) + + def _resolve_rust_self_path(self, file_id: str, module_path: str) -> Optional[str]: + return self._resolve_rust_module_path_from_bases(module_path, self._rust_self_base_dirs(file_id)) + + def _resolve_rust_super_path(self, file_id: str, module_path: str) -> Optional[str]: + levels = 0 + rest = module_path + while rest == "super" or rest.startswith("super::"): + levels += 1 + rest = "" if rest == "super" else rest[len("super::"):] + base_dir = self._rust_super_base_dir(file_id, levels) + if base_dir is None: + return None + if not rest: + mod_rs = posixpath.join(base_dir, "mod.rs") if base_dir else "mod.rs" + return self._first_existing_rust_file([mod_rs]) + return self._resolve_rust_module_path_from_bases(rest, [base_dir]) + + def _resolve_rust_module_path_from_bases(self, module_path: str, base_dirs: list[str]) -> Optional[str]: + segments = self._rust_path_segments(module_path) + if not segments: + 
return None + candidates: list[str] = [] + for base_dir in base_dirs: + normalized_base = normalize_path(base_dir) + if normalized_base == ".": + normalized_base = "" + for end in range(len(segments), 0, -1): + partial = posixpath.join(*segments[:end]) + if normalized_base: + candidates.append(posixpath.join(normalized_base, f"{partial}.rs")) + candidates.append(posixpath.join(normalized_base, partial, "mod.rs")) + else: + candidates.append(f"{partial}.rs") + candidates.append(posixpath.join(partial, "mod.rs")) + return self._first_existing_rust_file(candidates) + + def _rust_path_segments(self, module_path: str) -> list[str]: + return [ + segment + for segment in module_path.split("::") + if segment and segment not in {"crate", "self", "super", "*"} + ] + + def _first_existing_rust_file(self, candidates: list[str]) -> Optional[str]: + for candidate in candidates: + candidate_id = normalize_path(posixpath.normpath(candidate).removeprefix("./")) + if candidate_id in self.G and self.G.nodes[candidate_id].get("type") == NodeType.FILE: + return candidate_id + return None + + def _find_rust_crate_src_root(self, file_id: str) -> Optional[str]: + parts = PurePosixPath(file_id).parts[:-1] + for index in range(len(parts) - 1, -1, -1): + if parts[index] == "src": + return "/".join(parts[:index + 1]) + return None + + def _rust_self_base_dirs(self, file_id: str) -> list[str]: + path = PurePosixPath(file_id) + parent = path.parent.as_posix() + if parent == ".": + parent = "" + if path.name == "mod.rs": + return [parent] + stem_dir = posixpath.join(parent, path.stem) if parent else path.stem + return [stem_dir, parent] + + def _rust_super_base_dir(self, file_id: str, levels: int) -> Optional[str]: + if levels <= 0: + return self._node_parent_dir(file_id) + path = PurePosixPath(file_id) + base = path.parent + remaining = levels + if path.name == "mod.rs": + base = base.parent + remaining -= 1 + else: + remaining -= 1 + while remaining > 0: + base = base.parent + remaining -= 1 + 
base_posix = base.as_posix() + return "" if base_posix == "." else base_posix + + def _resolve_rust_invoke(self, file_id: str, dep: lang_parser.LPDependency) -> Optional[str]: + extra = dep.extra or {} + symbol = dep.symbol or dep.dst + if not symbol: + return None + + if extra.get("call_kind") == "path": + qualifier = extra.get("qualifier") or dep.dst + if not qualifier: + return None + target_file = self._resolve_rust_path_to_file(file_id, qualifier) + if target_file is not None: + target = self._find_rust_symbol_in_file(target_file, symbol, {NodeType.FUNCTION, NodeType.METHOD}) + if target is not None: + return target + if "::" not in qualifier: + return self._find_rust_qualified_in_file(file_id, qualifier, symbol) + return None + + same_file = self._find_rust_symbol_in_file(file_id, symbol, {NodeType.FUNCTION, NodeType.METHOD}) + if same_file is not None: + return same_file + return self._find_rust_crate_function(file_id, symbol) + + def _find_rust_symbol_in_file( + self, + file_id: str, + symbol: str | None, + node_types: set[NodeType], + ) -> Optional[str]: + if not symbol: + return None + direct_id = normalize_path(f"{file_id}:{symbol}") + if direct_id in self.G and self.G.nodes[direct_id].get("type") in node_types: + return direct_id + + matches = [] + prefix = f"{file_id}:" + for node_id, attrs in self.G.nodes(data=True): + if not node_id.startswith(prefix): + continue + if attrs.get("name") != symbol: + continue + if attrs.get("type") in node_types: + matches.append(node_id) + if len(matches) == 1: + return matches[0] + return None + + def _find_rust_qualified_in_file(self, file_id: str, qualifier: str, symbol: str | None) -> Optional[str]: + if not symbol: + return None + qualified_id = normalize_path(f"{file_id}:{qualifier}.{symbol}") + if qualified_id in self.G and self.G.nodes[qualified_id].get("type") == NodeType.METHOD: + return qualified_id + return None + + def _find_rust_crate_function(self, file_id: str, symbol: str | None) -> Optional[str]: + 
if not symbol: + return None + src_root = self._find_rust_crate_src_root(file_id) + if src_root is None: + return None + matches = [] + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") != NodeType.FUNCTION or attrs.get("name") != symbol: + continue + node_file = node_id.split(":", 1)[0] + if PurePosixPath(node_file).suffix.lower() not in self._RUST_EXTENSIONS: + continue + if node_file == f"{src_root}/lib.rs" or node_file == f"{src_root}/main.rs" or node_file.startswith(f"{src_root}/"): + matches.append(node_id) + if len(matches) == 1: + return matches[0] + return None + + def _find_c_symbol_in_file(self, file_id: str, symbol: str | None, node_types: set[NodeType]) -> Optional[str]: + if not symbol: + return None + direct_id = normalize_path(f"{file_id}:{symbol}") + if direct_id in self.G and self.G.nodes[direct_id].get("type") in node_types: + return direct_id + + matches = [] + prefix = f"{file_id}:" + for node_id, attrs in self.G.nodes(data=True): + if not node_id.startswith(prefix): + continue + if attrs.get("name") != symbol: + continue + if attrs.get("type") in node_types: + matches.append(node_id) + if len(matches) == 1: + return matches[0] + return None + + def _find_c_static_method(self, file_id: str, qualifier: str, symbol: str | None) -> Optional[str]: + if not symbol: + return None + direct_id = normalize_path(f"{file_id}:{qualifier}.{symbol}") + if direct_id in self.G and self.G.nodes[direct_id].get("type") == NodeType.METHOD: + return direct_id + + file_dir = self._node_parent_dir(file_id) + matches = [] + suffix = f":{qualifier}.{symbol}" + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") != NodeType.METHOD: + continue + if attrs.get("name") != symbol or not node_id.endswith(suffix): + continue + node_file = node_id.split(":", 1)[0] + if PurePosixPath(node_file).suffix.lower() not in self._C_FAMILY_EXTENSIONS: + continue + if self._node_parent_dir(node_file) == file_dir: + matches.append(node_id) + if 
len(matches) == 1: + return matches[0] + return None + + def _find_c_directory_symbol( + self, + file_id: str, + symbol: str | None, + node_types: set[NodeType], + ) -> Optional[str]: + if not symbol: + return None + file_dir = self._node_parent_dir(file_id) + matches = [] + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") not in node_types: + continue + if attrs.get("name") != symbol: + continue + node_file = node_id.split(":", 1)[0] + if PurePosixPath(node_file).suffix.lower() not in self._C_FAMILY_EXTENSIONS: + continue + if self._node_parent_dir(node_file) == file_dir: + matches.append(node_id) + if len(matches) == 1: + return matches[0] + # A function declared in a header (prototype) and defined in an + # implementation file yields two same-named nodes. The call target is + # the definition, so prefer implementation-file matches over headers. + impl_matches = [ + node_id for node_id in matches + if PurePosixPath(node_id.split(":", 1)[0]).suffix.lower() + not in self._C_HEADER_EXTENSIONS + ] + if len(impl_matches) == 1: + return impl_matches[0] + return None + + def _node_parent_dir(self, node_id: str) -> str: + parent = PurePosixPath(node_id).parent.as_posix() + return "" if parent == "." 
else parent + + def _find_named_unit_in_file(self, file_id: str, symbol: str | None) -> Optional[str]: + if not symbol: + return None + direct_id = normalize_path(f"{file_id}:{symbol}") + if direct_id in self.G and self.G.nodes[direct_id].get("type") in { + NodeType.CLASS, + NodeType.FUNCTION, + NodeType.METHOD, + }: + return direct_id + + matches = [] + prefix = f"{file_id}:" + for node_id, attrs in self.G.nodes(data=True): + if not node_id.startswith(prefix): + continue + if attrs.get("name") != symbol: + continue + if attrs.get("type") in {NodeType.CLASS, NodeType.FUNCTION, NodeType.METHOD}: + matches.append(node_id) + if len(matches) == 1: + return matches[0] + return None + + def _find_default_ecmascript_export_in_file(self, file_id: str) -> Optional[str]: + candidates: list[str] = [] + default_exports: list[str] = [] + prefix = f"{file_id}:" + for node_id, attrs in self.G.nodes(data=True): + if not node_id.startswith(prefix): + continue + if attrs.get("type") not in {NodeType.CLASS, NodeType.FUNCTION}: + continue + candidates.append(node_id) + if attrs.get("export_default") is True or attrs.get("extra", {}).get("export_default") is True: + default_exports.append(node_id) + + if len(default_exports) == 1: + return default_exports[0] + if default_exports: + return None + if len(candidates) == 1: + return candidates[0] + return None + + def _find_go_package_symbol(self, package_file_id: str, symbol: str | None) -> Optional[str]: + if not symbol: + return None + package_dir = PurePosixPath(package_file_id).parent.as_posix() + if package_dir == ".": + package_dir = "" + + matches = [] + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") not in {NodeType.FUNCTION, NodeType.METHOD}: + continue + if attrs.get("name") != symbol: + continue + node_file = node_id.split(":", 1)[0] + node_dir = PurePosixPath(node_file).parent.as_posix() + if node_dir == ".": + node_dir = "" + if node_dir == package_dir: + matches.append(node_id) + if len(matches) == 1: 
+ return matches[0] + return None + + def _resolve_go_module_import(self, module: str | None) -> Optional[str]: + if not module: + return None + module_path = self._read_go_module_path() + if not module_path: + return None + if module == module_path: + package_dir = "." + elif module.startswith(f"{module_path}/"): + package_dir = module[len(module_path) + 1:] + else: + return None + return self._go_package_representative_file(package_dir) + + def _read_go_module_path(self) -> Optional[str]: + if self._go_module_path_loaded: + return self._go_module_path_cache + self._go_module_path_loaded = True + go_mod = Path(self.repo_dir) / "go.mod" + try: + for line in go_mod.read_text(encoding="utf-8", errors="ignore").splitlines(): + stripped = line.strip() + if not stripped or stripped.startswith("//"): + continue + if stripped.startswith("module "): + parts = stripped.split() + if len(parts) >= 2: + self._go_module_path_cache = parts[1] + break + except OSError: + self._go_module_path_cache = None + return self._go_module_path_cache + + def _go_package_representative_file(self, package_dir: str) -> Optional[str]: + normalized_dir = normalize_path(package_dir) + if normalized_dir == ".": + normalized_dir = "" + files = [] + for node_id, attrs in self.G.nodes(data=True): + if attrs.get("type") != NodeType.FILE or not node_id.endswith(".go"): + continue + parent = PurePosixPath(node_id).parent.as_posix() + if parent == ".": + parent = "" + if parent == normalized_dir: + files.append(node_id) + files.sort() + if not files: + return None + if len(files) == 1: + return files[0] + doc_go = posixpath.join(normalized_dir, "doc.go") if normalized_dir else "doc.go" + if doc_go in files: + return doc_go + basename = PurePosixPath(normalized_dir).name if normalized_dir else "" + if basename: + basename_go = posixpath.join(normalized_dir, f"{basename}.go") + if basename_go in files: + return basename_go + return files[0] + + def _ensure_lp_import_placeholder( + self, + file_id: str, + 
dep: lang_parser.LPDependency, + dependency_index: int, + import_nodes_by_line: dict[int, str], + import_nodes_by_module: dict[str, str], + ) -> str: + node_id = self._find_lp_import_unit(dep, import_nodes_by_line, import_nodes_by_module) + if node_id is None: + suffix = self._lp_dependency_suffix(dep, dependency_index) + node_id = normalize_path(f"{file_id}:import:{suffix}") + self._add_node( + node_id, + type=NodeType.IMPORT, + name=dep.dst or dep.symbol or "unresolved import", + parent_id=file_id, + language=(dep.extra or {}).get("language"), + unit_type="import", + start_line=dep.line, + end_line=dep.line, + code="", + import_module=dep.dst, + extra=dict(dep.extra or {}), + ) + + self._mark_lp_import_unit(dep, import_nodes_by_line, import_nodes_by_module, False, None) + self.G.nodes[node_id].update( + resolved=False, + confidence=dep.confidence or "unresolved", + heuristic=True, + import_module=dep.dst, + ) + return node_id + + def _mark_lp_import_unit( + self, + dep: lang_parser.LPDependency, + import_nodes_by_line: dict[int, str], + import_nodes_by_module: dict[str, str], + resolved: bool, + target_id: Optional[str], + ) -> None: + node_id = self._find_lp_import_unit(dep, import_nodes_by_line, import_nodes_by_module) + if node_id is None or node_id not in self.G: + return + attrs: dict[str, Any] = { + "resolved": resolved, + "confidence": "resolved" if resolved else (dep.confidence or "unresolved"), + "import_module": dep.dst, + } + if resolved: + attrs["resolved_to"] = target_id + else: + attrs["heuristic"] = True + self.G.nodes[node_id].update(attrs) + + def _find_lp_import_unit( + self, + dep: lang_parser.LPDependency, + import_nodes_by_line: dict[int, str], + import_nodes_by_module: dict[str, str], + ) -> Optional[str]: + for module in (dep.dst, dep.symbol, (dep.extra or {}).get("module"), (dep.extra or {}).get("import_path")): + if module and module in import_nodes_by_module: + return import_nodes_by_module[module] + if dep.line is not None and dep.line 
in import_nodes_by_line: + return import_nodes_by_line[dep.line] + return None + + def _lp_dependency_suffix(self, dep: lang_parser.LPDependency, dependency_index: int) -> str: + if dep.line is not None: + return str(dep.line) + label = dep.dst or dep.symbol or dep.relation or "unknown" + safe = "".join(char if char.isalnum() else "_" for char in label).strip("_") or "unknown" + return f"dep:{dependency_index + 1}:{safe}" + + def _resolve_lp_reference(self, reference: str | None, file_id: str) -> Optional[str]: + if reference is None: + return None + node_id = normalize_path(reference) + if node_id in self.G: + return node_id + local_id = normalize_path(f"{file_id}:{reference}") + if local_id in self.G: + return local_id + return None + diff --git a/CoderMind/scripts/rpg/models.py b/CoderMind/scripts/rpg/models.py index 48db393..5f08258 100644 --- a/CoderMind/scripts/rpg/models.py +++ b/CoderMind/scripts/rpg/models.py @@ -98,6 +98,11 @@ class NodeMetaData: description: str = "" content: str = "" generator: str = "" + # Source language ("python", "go", "typescript", ...). Populated by the + # encoder via lang_parser.detect_language for any FILE / code-entity node + # backed by an on-disk source file. ``None`` for non-code nodes + # (features, directories, etc.) or when detection is unavailable. 
+ language: Optional[str] = None def to_dict(self) -> Dict[str, Any]: return { @@ -106,6 +111,7 @@ def to_dict(self) -> Dict[str, Any]: "content": self.content, "description": self.description, "generator": self.generator, + "language": self.language, } @classmethod @@ -120,6 +126,7 @@ def from_dict(cls, d: Optional[Dict[str, Any]]) -> Optional["NodeMetaData"]: description=d.get("description", ""), content=d.get("content", ""), generator=d.get("generator", ""), + language=d.get("language"), ) @@ -251,7 +258,17 @@ def infer_type_name_from_path(path: str, has_children: bool = False) -> Optional if path.endswith(".py"): return "file" - + + # Non-Python source files (Go / TypeScript / JavaScript / C / C++ / + # Rust, ...). Delegate to lang_parser if available so the inference + # stays in sync with the parser registry. + try: + from lang_parser import is_supported_source + if is_supported_source(path): + return "file" + except ImportError: + pass + return "directory" @@ -435,7 +452,7 @@ def __init__(self, repo_name: str, repo_info: str = "", excluded_files: List[str # Cross-graph mapping (dep_graph ↔ feature graph) self._feature_to_dep_map: Dict[str, List[str]] = {} # feature_node_id -> [dep_node_ids] - self._dep_graph_file: Optional[str] = None # relative path to dep_graph.json (for serialization) + self._dep_graph_file: Optional[str] = None # legacy external dep_graph path # Git sync state — see :meth:`set_git_meta`. ``None`` means the RPG # has never been linked to a git commit (e.g. brand-new RPG produced @@ -2073,18 +2090,32 @@ def rebuild_cross_maps(self) -> None: def save_dep_graph(self, path: str) -> None: """Serialize dep_graph to an independent JSON file. + .. deprecated:: + The dep_graph is now embedded in ``rpg.json`` via + ``RPG.to_dict(include_dep_graph=True)`` (the default), and + ``RPGService.load`` prefers the embedded copy. New code + should rely on ``svc.save(rpg_path)`` to persist both the + tree and the dep_graph in a single file. 
This helper is + kept so legacy callers and debugging tools that want a + standalone ``dep_graph.json`` snapshot keep working. + Wraps ``DependencyGraph.to_dict()`` with additional metadata (``code_dir``, ``generated_at``) to produce the schema defined in the encoder-decoder integration plan (§3.2). + + Writes are atomic: a crash during ``json.dump`` leaves the + existing ``dep_graph.json`` intact (instead of the truncated + half-file that the previous ``open('w') + json.dump`` pattern + produced when the encoder was killed mid-write). """ if self.dep_graph is None: raise ValueError("No dep_graph attached; call set_dep_graph() first") from datetime import datetime, timezone + from common.rpg_io import atomic_write_rpg raw = self.dep_graph.to_dict(dep_to_rpg_map=self._dep_to_rpg_map) raw["code_dir"] = self._dep_graph_code_dir raw["generated_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S") - with open(str(path), "w", encoding="utf-8") as f: - json.dump(raw, f, ensure_ascii=False, indent=2) + atomic_write_rpg(str(path), raw, ensure_ascii=False, indent=2) @staticmethod def load_dep_graph(path: str) -> "DependencyGraph": @@ -2794,9 +2825,18 @@ def _parse_tree_node(self, node_data: Dict[str, Any], parent_node: Optional[Node return node def save_json(self, path: str, ensure_ascii: bool = False, indent: int = 2): - """Save to JSON file.""" - with open(path, "w", encoding="utf-8") as f: - json.dump(self.to_dict(), f, ensure_ascii=ensure_ascii, indent=indent) + """Save to JSON file. + + Routes through :func:`common.rpg_io.atomic_write_rpg` so a + crash mid-write leaves the previous ``rpg.json`` intact rather + than truncated. Loaders (``safe_load_rpg``) can then fall back + to the last known-good version from the inner-git snapshot. 
+ """ + from common.rpg_io import atomic_write_rpg + atomic_write_rpg( + str(path), self.to_dict(), + ensure_ascii=ensure_ascii, indent=indent, + ) @classmethod def load_json(cls, path: str) -> "RPG": diff --git a/CoderMind/scripts/rpg/path_format.py b/CoderMind/scripts/rpg/path_format.py index f396879..e310614 100644 --- a/CoderMind/scripts/rpg/path_format.py +++ b/CoderMind/scripts/rpg/path_format.py @@ -13,9 +13,8 @@ METHOD : "rel/posix/path.py::Class::method" Disambiguation of kind (function vs class) is in ``NodeMetaData.type_name``, -NOT in the path itself. This avoids the historical duplication of -``::class Foo`` / ``::function bar`` prefixes that drifted into three -incompatible variants across the codebase. +NOT in the path itself. This keeps path strings canonical and avoids +duplicating kind prefixes such as ``::class Foo`` / ``::function bar``. Dep-graph nodes use a related but distinct convention (``"foo.py:Class.method"`` with a single colon and dot separator); the diff --git a/CoderMind/scripts/rpg/service.py b/CoderMind/scripts/rpg/service.py index de3565c..ae8d22b 100644 --- a/CoderMind/scripts/rpg/service.py +++ b/CoderMind/scripts/rpg/service.py @@ -19,6 +19,7 @@ from __future__ import annotations +import logging from pathlib import Path from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from .models import RPG, Node, Edge, EdgeType, NodeType, NodeMetaData, strip_uuid8, uuid8 @@ -50,16 +51,37 @@ def __init__(self, rpg: RPG): def load(cls, path: str | Path) -> "RPGService": """Load an RPG from a file and create a service instance. - If the RPG has a ``dep_graph_file`` field pointing to an existing - file, the dep_graph is automatically loaded and cross-maps rebuilt. + Read order (single-source-of-truth): + + 1. **Embedded** dep_graph — ``RPG.load_json`` already restored it + from ``data["dep_graph"]`` if present. This is the new + default since the dep_graph rides inside ``rpg.json``. + 2. 
**External** dep_graph — only consulted when no embedded copy + was found AND the legacy ``_dep_graph_file`` pointer is set + AND the file exists. Emits a single INFO log so the + fall-through is visible during debugging. + + The fall-through path keeps pre-embed-migration workspaces + readable. New encodes never produce a standalone + ``dep_graph.json`` so this path naturally goes cold. """ + _logger = logging.getLogger(__name__) + rpg = RPG.load_json(str(path)) svc = cls(rpg) svc._rpg_dir = Path(path).parent - # Auto-load external dep_graph if configured - if rpg._dep_graph_file: + + if rpg.dep_graph is not None: + # Embedded copy already attached by RPG.load_json — done. + pass + elif rpg._dep_graph_file: dgp = svc._rpg_dir / rpg._dep_graph_file if dgp.exists(): + _logger.info( + "Loading dep_graph from legacy external file %s; " + "next save will embed it inside rpg.json.", + dgp, + ) rpg.dep_graph = RPG.load_dep_graph(dgp) rpg.rebuild_cross_maps() return svc @@ -540,7 +562,7 @@ def sync_from_commit_diff( self, code_dir: str, workspace_root: str, - save_path: str | Path, + save_path: Optional[str | Path] = None, *, file_limit: Optional[int] = None, staged_only: bool = False, @@ -564,9 +586,11 @@ def sync_from_commit_diff( After a successful run, advances ``meta.git`` to the current HEAD (unless ``CMIND_NO_GIT_META=1`` or the workspace isn't a git - repo). The dep_graph is **always** persisted to ``save_path``; - the RPG file itself is saved by the caller (this method only - mutates ``self.rpg``). + repo). When ``save_path`` is provided the dep_graph is also + persisted as a standalone JSON (legacy behaviour preserved for + callers that still want the sidecar); when ``save_path is None`` + the dep_graph lives only in ``self.rpg.dep_graph`` and rides + inside ``rpg.json`` via the caller's ``svc.save(rpg_path)``. 
Args: code_dir: Absolute path to the directory that ``DependencyGraph`` @@ -574,7 +598,9 @@ def sync_from_commit_diff( workspace_root: Absolute path to the git working tree. Used both to read ``meta.git``'s sibling HEAD and to compute the relative prefix on dep_graph paths. - save_path: Output path for ``dep_graph.json``. + save_path: Optional standalone output path for the dep_graph. + ``None`` is the new default for callers that rely on the + embedded dep_graph in ``rpg.json``. file_limit: Cap on changed-file count before falling back to full. Defaults to :attr:`DEFAULT_INCREMENTAL_FILE_LIMIT`. staged_only: If ``True``, restrict diff to ``git diff --cached`` @@ -596,7 +622,7 @@ def sync_from_commit_diff( ) limit = file_limit if file_limit is not None else self.DEFAULT_INCREMENTAL_FILE_LIMIT - save_path = str(save_path) + save_path = str(save_path) if save_path is not None else None # ── Step 1: read current HEAD (silent-fail outside a git repo) ── current = read_head(workspace_root) @@ -736,7 +762,7 @@ def sync_from_file_list( file_paths: List[str], code_dir: str, workspace_root: str, - save_path: str | Path, + save_path: Optional[str | Path] = None, *, renames: Optional[Dict[str, str]] = None, ) -> Dict: @@ -751,38 +777,42 @@ def sync_from_file_list( responsible for that if they want it. Args: - file_paths: Repo-relative ``.py`` paths to refresh. + file_paths: Repo-relative source paths to refresh (any supported + language: ``.py``/``.go``/``.rs``/``.ts``/``.js``/``.c``/``.cpp``). code_dir / workspace_root: As :meth:`refresh_dep_graph`. - save_path: Output path for ``dep_graph.json``. + save_path: Optional standalone output path for the dep_graph. + ``None`` (default) means the caller relies on the embedded + dep_graph in ``rpg.json`` via a subsequent ``svc.save``. renames: Optional ``{old: new}`` pairs (rare in codegen; codegen doesn't typically rename files). 
""" + save_path_str = str(save_path) if save_path is not None else None # Lazy bootstrap: codegen may call this on an RPG that doesn't # have a dep_graph yet (very first batch). Fall back to full. if self.rpg.dep_graph is None: self.refresh_dep_graph( code_dir=code_dir, workspace_root=workspace_root, - save_path=str(save_path), + save_path=save_path_str, ) return { "mode": "full", "reason": "no_existing_dep_graph", "dep_nodes": len(self.rpg.dep_graph.G.nodes()), "dep_edges": len(self.rpg.dep_graph.G.edges()), - "save_path": str(save_path), + "save_path": save_path_str, } stats = self._apply_incremental_dep_graph_update( changed_files=list(file_paths), renames=renames or {}, - save_path=str(save_path), + save_path=save_path_str, ) return { "mode": "incremental", "reason": "explicit_file_list", **stats, - "save_path": str(save_path), + "save_path": save_path_str, } def _apply_incremental_dep_graph_update( @@ -790,7 +820,7 @@ def _apply_incremental_dep_graph_update( *, changed_files: List[str], renames: Dict[str, str], - save_path: str, + save_path: Optional[str] = None, ) -> Dict: """Run ``DependencyGraph.update_files`` + rebuild RPG mappings + save. @@ -827,15 +857,21 @@ def _strip(p: str) -> str: self.rpg._dep_to_rpg_map = self.rpg._build_dep_to_rpg_map() self.rpg.rebuild_cross_maps() - save_path_resolved = _Path(save_path).resolve() - self.rpg.save_dep_graph(save_path_resolved) - try: - self.rpg._dep_graph_file = str( - save_path_resolved.relative_to(self._rpg_dir.resolve()) - ) - except ValueError: - # dep_graph.json lives outside the RPG dir — keep absolute path. - self.rpg._dep_graph_file = str(save_path_resolved) + # ``save_path`` is optional in the embedded-dep_graph world: the + # dep_graph rides inside rpg.json via ``RPG.to_dict``, so callers + # that don't need a standalone ``dep_graph.json`` pass ``None`` and + # rely on ``svc.save(rpg_path)`` afterwards. Legacy callers that + # still pass a path keep their standalone artefact. 
+ if save_path is not None: + save_path_resolved = _Path(save_path).resolve() + self.rpg.save_dep_graph(save_path_resolved) + try: + self.rpg._dep_graph_file = str( + save_path_resolved.relative_to(self._rpg_dir.resolve()) + ) + except ValueError: + # dep_graph.json lives outside the RPG dir — keep absolute path. + self.rpg._dep_graph_file = str(save_path_resolved) return { "dep_nodes": len(self.rpg.dep_graph.G.nodes()), diff --git a/CoderMind/scripts/rpg_edit/apply.py b/CoderMind/scripts/rpg_edit/apply.py index 6df82b0..2421576 100644 --- a/CoderMind/scripts/rpg_edit/apply.py +++ b/CoderMind/scripts/rpg_edit/apply.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -"""Apply an EditPlan to RPG feature graph and code, then refresh dep_graph. +"""Apply an EditPlan to RPG feature graph and code. Reads an EditPlan JSON, applies feature_changes to the RPG, applies -code_changes as diffs, refreshes dep_graph, runs related tests, and -outputs a result JSON. Supports rollback on test failure. +code_changes as diffs, refreshes the embedded dep_graph, runs related +tests, and outputs a result JSON. Supports rollback on test failure. """ import argparse diff --git a/CoderMind/scripts/rpg_edit/validate.py b/CoderMind/scripts/rpg_edit/validate.py index 08373ec..e52f7b0 100644 --- a/CoderMind/scripts/rpg_edit/validate.py +++ b/CoderMind/scripts/rpg_edit/validate.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Pre-check for rpg_edit: verify repo_rpg.json and dep_graph.json exist and are loadable.""" +"""Pre-check for rpg_edit inputs and the embedded dependency graph.""" import argparse import json @@ -46,9 +46,12 @@ def main(): has_dep_graph = svc.rpg.dep_graph is not None if not has_dep_graph and not args.dep_graph.exists(): result = {"type": "error", "error_code": "dep_graph_not_found", - "message": f"dep_graph.json not found: {args.dep_graph}. 
" - "Run `cmind script update_graphs.py sync` " - "to build it from the current code."} + "message": ( + f"rpg.json has no embedded dep_graph and no legacy " + f"standalone dep_graph.json at {args.dep_graph}. " + "Run /cmind.encode to (re)build it; the embedded " + "dep_graph rides inside rpg.json." + )} print(json.dumps(result) if args.json else f"Error: {result['message']}") return 1 diff --git a/CoderMind/scripts/rpg_encoder/check_encode.py b/CoderMind/scripts/rpg_encoder/check_encode.py index 0c5e10b..b4d2033 100644 --- a/CoderMind/scripts/rpg_encoder/check_encode.py +++ b/CoderMind/scripts/rpg_encoder/check_encode.py @@ -23,7 +23,33 @@ if str(_script_dir) not in sys.path: sys.path.insert(0, str(_script_dir)) -from common.paths import RPG_FILE # noqa: E402 +from common.paths import RPG_FILE, WORKSPACE_ROOT # noqa: E402 + + +def _cwd_workspace_rpg_path() -> Path: + """Return the workspace-local RPG path nearest to the current cwd.""" + cwd = Path.cwd().resolve() + for candidate in [cwd, *cwd.parents]: + cmind_dir = candidate / ".cmind" + if cmind_dir.is_dir(): + return cmind_dir / "data" / "rpg.json" + return cwd / ".cmind" / "data" / "rpg.json" + + +def _rpg_path_candidates() -> list[Path]: + """Return RPG paths to probe, ordered by preferred storage layout.""" + cwd = Path.cwd().resolve() + workspace_root = Path(WORKSPACE_ROOT).resolve() + workspace_local = _cwd_workspace_rpg_path() + candidates: list[Path] = [] + try: + in_import_workspace = cwd.is_relative_to(workspace_root) + except ValueError: + in_import_workspace = False + if in_import_workspace: + candidates.append(Path(RPG_FILE)) + candidates.append(workspace_local) + return candidates def load_json(path: Path) -> Dict[str, Any] | None: @@ -87,7 +113,8 @@ def _count_tree_nodes(node: Dict[str, Any]) -> int: def check_encode() -> Dict[str, Any]: """Check encode state and return a result dict.""" - rpg_path = Path(RPG_FILE) + candidates = _rpg_path_candidates() + rpg_path = next((path for path in 
candidates if path.exists()), candidates[0]) # Case 1: RPG file does not exist → init if not rpg_path.exists(): diff --git a/CoderMind/scripts/rpg_encoder/prompts/encoding_prompts.py b/CoderMind/scripts/rpg_encoder/prompts/encoding_prompts.py index 49f2995..ada5fba 100644 --- a/CoderMind/scripts/rpg_encoder/prompts/encoding_prompts.py +++ b/CoderMind/scripts/rpg_encoder/prompts/encoding_prompts.py @@ -50,7 +50,7 @@ You are an expert in large-scale software repository auditing. ## Goal -Exclude Python paths that clearly do NOT contribute to core library logic or functionality. +Exclude paths that clearly do NOT contribute to core library logic or functionality. ## Key Policy - Default: **keep code unless exclusion is obvious** @@ -58,10 +58,8 @@ - Err on the side of keeping — conservative filtering ## Scope -Consider only: -1) `.py` files -2) Directories containing `.py` files -Ignore folders with no `.py`. +Consider only source files in supported languages and the directories that +contain them. Ignore folders with no recognised source files. ## Exclude when it is obvious the content is non-core Examples of clearly non-functional areas: @@ -88,7 +86,7 @@ ``` path/to/excluded_dir/ -some/other/irrelevant.py +some/other/irrelevant third_party/ tests/ ... @@ -97,7 +95,7 @@ """ ANALYZE_DATA_FLOW = """ -You are a system architect tasked with EXTRACTING the inter-subtree (functional area) data flows for a Python repository, based solely on the provided context. +You are a system architect tasked with EXTRACTING the inter-subtree (functional area) data flows for a source repository, based solely on the provided context. ## Task From the repository context below, infer a directed data-flow graph between functional subtrees. Each edge represents a data object moving from one subtree to another. @@ -123,8 +121,8 @@ ### Data typing guidance - The "data_type" field can be: - - a single precise type string, e.g. "pandas.DataFrame" - - OR an array of alternatives, e.g. 
["pandas.DataFrame", "pyarrow.Table"] to indicate acceptable forms. + - a single precise type string, e.g. "UserRecord" + - OR an array of alternatives, e.g. ["UserRecord", "User"] to indicate acceptable forms. - Container types are allowed and should be explicit, e.g. "list[Sample]", "dict[str, MetricValue]", "tuple[Header, bytes]". - Prefer consistent, reusable type labels across edges when representing the same logical payload. diff --git a/CoderMind/scripts/rpg_encoder/prompts/parse_prompts.py b/CoderMind/scripts/rpg_encoder/prompts/parse_prompts.py index 50fefe8..8410463 100644 --- a/CoderMind/scripts/rpg_encoder/prompts/parse_prompts.py +++ b/CoderMind/scripts/rpg_encoder/prompts/parse_prompts.py @@ -1,29 +1,30 @@ """Parse Prompt Templates. -LLM prompt templates for semantic feature extraction from Python code. -Adapted for agent-based execution where the agent reads source files -directly rather than receiving code inline. +LLM prompt templates for semantic feature extraction from source code +across any supported language (Python, Go, TypeScript/JavaScript, C/C++, +Rust). Adapted for agent-based execution where the agent reads source +files directly rather than receiving code inline. """ PARSE_CLASS = """ ## Instruction -You are a senior software analyst, tasked with extracting high-level semantic features from Python classes. -You will be given a list of target files and classes. Read each file to understand the implementation, then extract features. +You are a senior software analyst, tasked with extracting high-level semantic features from class-like constructs (classes, structs, interfaces, traits, enums, depending on the language). +You will be given a list of target files and class-like definitions. Read each file to understand the implementation, then extract features. ### Key Goals: -- Complete analysis: Provide a full, semantic feature extraction for all specified classes. 
-- Exhaustive coverage: Include **every** class and **every** method, including special methods (`__init__`, `__new__`, `__enter__`, `__exit__`), class methods, and static methods. -- Focus on purpose and high-level behavior — what each class represents or manages in the system. +- Complete analysis: Provide a full, semantic feature extraction for all specified definitions. +- Exhaustive coverage: Include **every** class-like construct and **every** method (or equivalent — receiver methods, member functions, associated functions, constructors, destructors, lifecycle hooks). +- Focus on purpose and high-level behavior — what each definition represents or manages in the system. - Summarize what each method is responsible for at a high level, avoiding implementation details. - If multiple definitions share the same method name, output that method name only once and merge their features. ## Feature Extraction Principles: -1. Focus on the purpose and behavior of each class — what it represents or manages. +1. Focus on the purpose and behavior of each class-like construct — what it represents or manages. 2. For methods, describe their main purpose, not the implementation details. -3. Use the class name, its methods, and the surrounding context to infer meaning. -4. If a class serves multiple functions, list multiple features accordingly. -5. Do not fabricate class names or methods that are not in the input. -6. Do not skip any defined method, including special methods (e.g., `__init__`, `__new__`, `__repr__`) and helper methods. +3. Use the name, its methods, and the surrounding context to infer meaning. +4. If a definition serves multiple functions, list multiple features accordingly. +5. Do not fabricate names or methods that are not in the input. +6. Do not skip any defined method, including constructors / destructors / lifecycle hooks and helper methods. ### Feature Naming Rules: 1. 
Use the "verb + object" format @@ -92,7 +93,7 @@ {{ "DataLoader": {{ - "__init__": {{ + "configure": {{ "initialize data loading configuration": "Configures the loader with the input source and validation defaults." }}, "load_data": {{ @@ -121,7 +122,7 @@ PARSE_FUNCTION = """ ## Instruction You are a senior software analyst. -Your task is to extract high-level semantic features from standalone Python functions. +Your task is to extract high-level semantic features from standalone (module-level) functions across any supported language (Python, Go, TypeScript/JavaScript, C/C++, Rust). You will be given a list of target files and functions. Read each file to understand the implementation, then extract features. ### Key Goals diff --git a/CoderMind/scripts/rpg_encoder/refactor_tree.py b/CoderMind/scripts/rpg_encoder/refactor_tree.py index 238e868..0299def 100644 --- a/CoderMind/scripts/rpg_encoder/refactor_tree.py +++ b/CoderMind/scripts/rpg_encoder/refactor_tree.py @@ -83,6 +83,8 @@ def __init__( skeleton_info: str = "", logger: Optional[logging.Logger] = None, llm_client: Optional[Any] = None, + language: Optional[str] = None, + language_map: Optional[Dict[str, str]] = None, **kwargs, ): self.repo_name = repo_name @@ -91,6 +93,21 @@ def __init__( self.repo_skeleton = repo_skeleton self.skeleton_info = skeleton_info + # Language metadata propagated into every NodeMetaData this + # encoder mints. ``language`` is the repo-wide default; pass + # ``None`` (or omit) to leave ``meta.language`` unset rather + # than guessing. ``language_map`` (path-prefix -> language) + # lets multi-language repos override per-subtree. + # ``_resolve_language(path)`` returns the best match (longest + # matching prefix) or falls back to ``language``. Keys are + # normalised so callers can pass ``"cmd/"`` or ``"cmd"`` and + # either form matches. 
+ self.language: Optional[str] = language + self._language_map: Dict[str, str] = { + self._normalise_lang_prefix(k): v + for k, v in (language_map or {}).items() + } + self.rpg = RPG(repo_name=self.repo_name) if logger: @@ -117,6 +134,33 @@ def _uuid8() -> str: """Short uuid (8-char hex) for node ID generation.""" return uuid.uuid4().hex[:8] + @staticmethod + def _normalise_lang_prefix(prefix: str) -> str: + """Strip trailing ``/`` from a ``language_map`` key for matching.""" + return prefix.rstrip("/") + + def _resolve_language(self, path: Optional[str]) -> Optional[str]: + """Return the language for ``path``. + + Looks up the longest path-prefix match in ``language_map``; + falls back to the repo-wide ``self.language`` (which may itself + be ``None`` if the caller never supplied one). ``None`` / + empty paths return the default. + """ + if not path or not self._language_map: + return self.language + normalised_path = str(path).lstrip("/") + best: Optional[str] = None + best_len = -1 + for prefix, lang in self._language_map.items(): + if not prefix: + continue + if normalised_path == prefix or normalised_path.startswith(prefix + "/"): + if len(prefix) > best_len: + best = lang + best_len = len(prefix) + return best if best is not None else self.language + def step(self, memory: Memory): """Single LLM step: generate, parse solution JSON. 
@@ -614,6 +658,7 @@ def run( path=file_node_path(file_path), description=f_features.get("_file_summary_", ""), generator="rpg_encoder", + language=self._resolve_language(file_path), ), ) file2node[file_path] = file_node @@ -650,6 +695,7 @@ def run( "", ), generator="rpg_encoder", + language=self._resolve_language(file_path), ), unit=func_unit.key(), ) @@ -675,6 +721,7 @@ def run( "", ), generator="rpg_encoder", + language=self._resolve_language(file_path), ), unit=cls_unit.key(), ) @@ -706,6 +753,7 @@ def run( "", ), generator="rpg_encoder", + language=self._resolve_language(file_path), ), unit=mtd_unit.key(), ) @@ -916,6 +964,17 @@ def refactor_new_files( logger.info("Starting incremental refactor on new files...") + # Pick up the repo's dominant language from the parsed_tree keys + # so every NodeMetaData this incremental pass mints carries the + # correct ``meta.language``. Without this, files added through + # the post-merge hook would silently land with ``meta.language=None`` + # while the initial-encode path (which goes through RPGParser) + # gets it right via :func:`lang_parser.dominant_language`. 
+ from lang_parser import dominant_language as _dominant_language + new_files_lang = _dominant_language(parsed_tree.keys()) + if new_files_lang: + logger.info("Incremental refactor dominant language: %s", new_files_lang) + instance = cls( repo_dir=repo_dir, repo_info=repo_info, @@ -923,6 +982,7 @@ def refactor_new_files( skeleton_info=skeleton_info, repo_name=repo_name, logger=logger, + language=new_files_lang, ) instance.rpg = existing_rpg @@ -998,6 +1058,7 @@ def refactor_new_files( path=file_node_path(file_path), description=current_summary, generator="rpg_encoder", + language=instance._resolve_language(file_path), ), ) instance.rpg.add_node(file_node) @@ -1033,6 +1094,7 @@ def refactor_new_files( desc_key_function(func_name, feature), "" ), generator="rpg_encoder", + language=instance._resolve_language(file_path), ), unit=func_unit.key(), ) @@ -1058,6 +1120,7 @@ def refactor_new_files( desc_key_class(class_name, feat), "" ), generator="rpg_encoder", + language=instance._resolve_language(file_path), ), unit=cls_unit.key(), ) @@ -1086,6 +1149,7 @@ def refactor_new_files( "", ), generator="rpg_encoder", + language=instance._resolve_language(file_path), ), unit=mtd_unit.key(), ) @@ -1478,6 +1542,17 @@ def refactor_modified_files( logger.info("Starting refactor for modified files...") + # Same language-detection as :meth:`refactor_new_files` — keep + # ``meta.language`` correct for files revisited by the + # incremental modified-files path. 
+ from lang_parser import dominant_language as _dominant_language + modified_files_lang = _dominant_language(parsed_tree.keys()) + if modified_files_lang: + logger.info( + "Incremental modified-files dominant language: %s", + modified_files_lang, + ) + instance = cls( repo_dir=repo_dir, repo_info=repo_info, @@ -1485,6 +1560,7 @@ def refactor_modified_files( skeleton_info=skeleton_info, repo_name=repo_name, logger=logger, + language=modified_files_lang, ) instance.rpg = existing_rpg diff --git a/CoderMind/scripts/rpg_encoder/rpg_encoding.py b/CoderMind/scripts/rpg_encoder/rpg_encoding.py index 6ff0017..d3fd508 100644 --- a/CoderMind/scripts/rpg_encoder/rpg_encoding.py +++ b/CoderMind/scripts/rpg_encoder/rpg_encoding.py @@ -31,13 +31,16 @@ SystemMessage, UserMessage, ) +from common.rpg_io import atomic_write_rpg from common.utils import ( + is_skip_dir, exclude_files, normalize_path, parse_code_blocks, parse_solution_output, truncate_by_token, ) +from lang_parser import dominant_language, is_supported_source, is_test_file from rpg import RPG from .prompts import EXCLUDE_FILES, GENERATE_REPO_INFO @@ -119,14 +122,7 @@ def _load_skeleton_from_repo(self) -> Tuple[str, List[str]]: for root, dirs, files in os.walk(self.repo_dir): # Skip hidden dirs and common non-essential dirs - dirs[:] = [ - d for d in dirs - if not d.startswith(".") - and d not in { - "__pycache__", "node_modules", ".git", - ".venv", "venv", "env", - } - ] + dirs[:] = [d for d in dirs if not is_skip_dir(d)] dirs.sort() rel_root = os.path.relpath(root, self.repo_dir) @@ -139,7 +135,7 @@ def _load_skeleton_from_repo(self) -> Tuple[str, List[str]]: rel_path = os.path.join(rel_root, fname) if rel_root else fname rel_path = rel_path.replace("\\", "/") tree_lines.append(rel_path) - if fname.endswith(".py"): + if is_supported_source(rel_path) and not is_test_file(rel_path): valid_files.append(rel_path) skeleton_info = "\n".join(tree_lines) @@ -492,11 +488,31 @@ def parse_rpg_from_repo( ) if save_path: - with 
open(save_path, "w") as f: - json.dump(final_result, f, indent=4, default=lambda o: o.to_dict() if hasattr(o, 'to_dict') else str(o)) + # Atomic write so a kill mid-dump doesn't truncate the + # intermediate parsed-tree snapshot (resumed runs read it). + atomic_write_rpg( + save_path, final_result, + indent=4, + default=lambda o: o.to_dict() if hasattr(o, 'to_dict') else str(o), + ) self.logger.info("Features parsed: files=%d", len(file2feature)) + # Determine the repo's dominant language from the files actually + # scanned by the skeleton so RefactorTree can stamp every + # NodeMetaData it produces (FILE / CLASS / FUNCTION / METHOD) + # with the correct ``meta.language``. Without this, the + # encoder fell back to its (legacy) "python" default and + # produced misleading meta for Go / Rust / TS / ... repos. + repo_dominant_language = dominant_language(self.valid_files) + if repo_dominant_language: + self.logger.info("Dominant language detected: %s", repo_dominant_language) + else: + self.logger.info( + "Dominant language could not be detected from skeleton; " + "RefactorTree will leave meta.language unset." + ) + # 4) Refactor to RPG refactor_agent = RefactorTree( repo_dir=self.repo_dir, @@ -506,6 +522,7 @@ def parse_rpg_from_repo( repo_name=self.repo_name, logger=self.logger, llm_client=self.llm_client, + language=repo_dominant_language, ) self.logger.info("Refactoring to RPG...") final_rpg, refactor_traj, repo_rpg = refactor_agent.run( @@ -534,8 +551,13 @@ def parse_rpg_from_repo( } if save_path: - with open(save_path, "w") as f: - json.dump(final_result, f, indent=4, default=lambda o: o.to_dict() if hasattr(o, 'to_dict') else str(o)) + # Atomic write of the final encoder result; see the same + # note on the parsed-tree snapshot above. 
+ atomic_write_rpg( + save_path, final_result, + indent=4, + default=lambda o: o.to_dict() if hasattr(o, 'to_dict') else str(o), + ) self.logger.info("RPG refactoring done.") self.logger.info("=== RPG parsing pipeline finished ===") diff --git a/CoderMind/scripts/rpg_encoder/rpg_evolution.py b/CoderMind/scripts/rpg_encoder/rpg_evolution.py index 255e9a6..5b9507e 100644 --- a/CoderMind/scripts/rpg_encoder/rpg_evolution.py +++ b/CoderMind/scripts/rpg_encoder/rpg_evolution.py @@ -28,17 +28,18 @@ import time from typing import Any, Dict, List, Optional, Tuple, Union +from common.rpg_io import atomic_write_rpg from common.utils import ( + is_skip_dir, exclude_files, filter_excluded_files, - is_test_file, normalize_path, ) +from lang_parser import is_supported_source, is_test_file as is_supported_test_file from rpg.code_unit import CodeSnippetBuilder, CodeUnit, ParsedFile from rpg import NodeType, RPG from .refactor_tree import RefactorTree -from .rpg_encoding import RPGParser from .semantic_parsing import ParseFeatures logger = logging.getLogger(__name__) @@ -50,13 +51,15 @@ def _filter_non_test_py_files(path: str) -> bool: - """Return True if *path* is a non-test ``.py`` file. + """Return True if *path* is a parseable, non-test source file. Used as a filter predicate when walking the repository directory. + The function keeps its public name for API stability, but the + predicate accepts any language supported by ``lang_parser``. 
""" - if not path.endswith(".py"): + if not is_supported_source(path): return False - return not is_test_file(path) + return not is_supported_test_file(path) def _load_skeleton_from_repo( @@ -84,14 +87,7 @@ def _load_skeleton_from_repo( valid_files: List[str] = [] for root, dirs, files in os.walk(repo_dir): - dirs[:] = [ - d for d in dirs - if not d.startswith(".") - and d not in { - "__pycache__", "node_modules", ".git", - ".venv", "venv", "env", - } - ] + dirs[:] = [d for d in dirs if not is_skip_dir(d)] dirs.sort() rel_root = os.path.relpath(root, repo_dir) @@ -330,24 +326,19 @@ def _update_dep_graph_index( ) -> None: """Update the dependency graph and rebuild RPG node index. - Routes through :class:`rpg.service.RPGService` so the dep_graph - is **persisted to disk** (``save_path``) and stays in sync with - ``self.rpg.dep_graph``. The previous implementation called - :meth:`RPG.parse_dep_graph` directly, which only mutated the - in-memory ``rpg.dep_graph`` — leaving ``dep_graph.json`` stale - whenever the encoder wrote ``rpg.json`` separately afterwards. - That drift caused MCP-server / ``update_graphs.py status`` reads - to return inconsistent data after ``/cmind.update_rpg``. + Routes through :class:`rpg.service.RPGService` so the in-memory + dep_graph, dep-to-RPG mappings, and cross maps are refreshed + together. When ``save_path`` is provided, a standalone dep_graph + snapshot is also written for compatibility tooling. Args: rpg: The RPG to attach the rebuilt dep_graph to. repo_dir: Workspace root (which is also the project repo root after the workspace=repo unification). logger: Logger for status output. - save_path: Path where ``dep_graph.json`` should be written. - If ``None``, dep_graph stays in-memory only (legacy - behaviour; preserved for callers that haven't been - migrated yet, but flagged with a warning). + save_path: Optional legacy path where a standalone + ``dep_graph.json`` should be written. 
If ``None``, the + caller persists the refreshed graph by saving ``rpg.json``. """ logger.info("Updating dependency graph and RPG node index...") try: @@ -382,10 +373,13 @@ def _update_dep_graph_index( dep_count, map_count, save_path, ) else: - logger.warning( - "Dependency graph updated in-memory only (%d nodes, " - "%d mappings) — caller did not provide save_path so " - "dep_graph.json on disk may be stale.", + # The new default: dep_graph rides inside rpg.json (single + # source of truth). No standalone dep_graph.json is + # written from this call; the caller's ``svc.save(rpg)`` + # embeds the in-memory graph via ``RPG.to_dict``. + logger.info( + "Dependency graph updated in-memory (%d nodes, " + "%d mappings); caller embeds into rpg.json on save.", dep_count, map_count, ) except Exception as e: @@ -424,7 +418,7 @@ def _process_add_files( # Build code map for new files only file_code_map: Dict[str, str] = {} for fpath in new_files: - if not fpath.endswith(".py"): + if not is_supported_source(fpath) or is_supported_test_file(fpath): continue if fpath in file_code_map_all: file_code_map[fpath] = file_code_map_all[fpath] @@ -666,7 +660,7 @@ def process_diff( behaviour and will log a warning. Pipeline: - 1. Exclude irrelevant files + 1. Carry forward deterministic exclusions (no per-commit LLM vote) 2. Compute detailed diff (``generate_detailed_diff``) 3. Process additions / deletions / modifications 4. Update dependency graph index @@ -705,21 +699,22 @@ def process_diff( last_excluded_files = last_rpg.excluded_files if last_rpg else [] - # Exclude irrelevant files in current repo - rpg_parser = RPGParser( - repo_dir=cur_repo_dir, - repo_name=repo_name, - logger=logger, - ) - - cur_exclude_files = rpg_parser.exclude_irrelevant_files( - repo_info=repo_info, - max_votes=max_exclude_votes, + # Incremental updates run on every commit (post-commit hook). 
Exclusion + # is fully deterministic here — no per-commit LLM vote (it cost ~30 + # round-trips per generation and only re-derived paths the rules below + # already cover). ``generate_detailed_diff`` loads both snapshots via + # ``_load_skeleton_from_repo`` (which prunes skip-dirs and keeps only + # supported, non-test source) and re-applies the ``exclude_files`` + # prefix rules itself, so the only thing to carry forward is the + # encode-time exclusion list — which may include LLM-identified + # vendored / third-party paths the prefix rules can't infer. + all_exclude_files = sorted(set(last_excluded_files)) + logger.info( + "Carrying forward %d excluded path(s) from the encode baseline.", + len(all_exclude_files), ) - all_exclude_files = sorted(set(last_excluded_files + cur_exclude_files)) - logger.info("Excluded files for current repo: %d", len(all_exclude_files)) - # Compute detailed diff + # Compute detailed diff (re-applies deterministic exclusion internally) all_diff = generate_detailed_diff( last_repo_dir=last_repo_dir, cur_repo_dir=cur_repo_dir, @@ -734,12 +729,15 @@ def process_diff( "last_rpg": last_rpg, } - # Filter to .py files + # Filter to supported source files (any language registered with lang_parser), + # excluding tests so the encoder doesn't index test code as features. 
add_files = [ - f for f in all_diff.get("added", {}).keys() if f.endswith(".py") + f for f in all_diff.get("added", {}).keys() + if is_supported_source(f) and not is_supported_test_file(f) ] deleted_files = [ - f for f in all_diff.get("deleted", {}).keys() if f.endswith(".py") + f for f in all_diff.get("deleted", {}).keys() + if is_supported_source(f) and not is_supported_test_file(f) ] modified_result = { f: d @@ -747,7 +745,8 @@ def process_diff( if ( isinstance(d, dict) and any(d.get(k) for k in ("changed", "added", "deleted")) - and f.endswith(".py") + and is_supported_source(f) + and not is_supported_test_file(f) ) } @@ -813,8 +812,11 @@ def process_diff( } if save_path: - with open(save_path, "w", encoding="utf-8") as f: - json.dump(result, f, indent=4) + # Atomic write: ``result`` embeds ``rpg.to_dict()``; a killed + # diff job used to leave a half-truncated artefact that + # downstream consumers (``cmind diff``, debug tools) would + # fail to parse on the next read. + atomic_write_rpg(save_path, result, indent=4) total_time = time.time() - global_start logger.info( diff --git a/CoderMind/scripts/rpg_encoder/run_encode.py b/CoderMind/scripts/rpg_encoder/run_encode.py index 38e30af..d9ed2a0 100644 --- a/CoderMind/scripts/rpg_encoder/run_encode.py +++ b/CoderMind/scripts/rpg_encoder/run_encode.py @@ -25,7 +25,8 @@ if str(_script_dir) not in sys.path: sys.path.insert(0, str(_script_dir)) -from common.paths import RPG_FILE, DEP_GRAPH_FILE, RPG_HTML_FILE, WORKSPACE_ROOT, ensure_cmind_dir # noqa: E402 +from common.paths import RPG_FILE, RPG_HTML_FILE, WORKSPACE_ROOT, ensure_cmind_dir # noqa: E402 +from common.rpg_io import atomic_write_rpg # noqa: E402 from common.trajectory import Trajectory # noqa: E402 @@ -100,35 +101,16 @@ def run_encode( traj.start_step(step_dep.step_id) dep_graph_stats = {} - dep_graph_output = None try: rpg.parse_dep_graph(repo_dir) if rpg.dep_graph: - # Save dep_graph as a standalone file so that: - # 1. 
rpg.json stays small (feature tree + maps only) - # 2. git hooks can update dep_graph.json independently - # 3. file layout is consistent from first encode onward - dep_graph_output = str(DEP_GRAPH_FILE) - os.makedirs(os.path.dirname(dep_graph_output), exist_ok=True) - dg_dict = rpg.dep_graph.to_dict( - dep_to_rpg_map=rpg._dep_to_rpg_map, - ) - with open(dep_graph_output, "w", encoding="utf-8") as dgf: - json.dump(dg_dict, dgf, indent=2, ensure_ascii=False) - - # Store a relative reference from rpg.json's directory to - # dep_graph.json so the layout is portable. Fall back to - # the absolute path when they live in different trees - # (e.g. user passed --output to a custom location). - rpg_dir = Path(output).resolve().parent - dep_graph_resolved = Path(dep_graph_output).resolve() - try: - rpg._dep_graph_file = str( - dep_graph_resolved.relative_to(rpg_dir) - ) - except ValueError: - rpg._dep_graph_file = str(dep_graph_resolved) - + # The dep_graph is embedded in rpg.json by ``rpg.to_dict()`` + # (the default is ``include_dep_graph=True``), so we no + # longer write a standalone ``dep_graph.json``. Single + # source of truth eliminates the encoder-vs-hook drift + # that used to bite ``RPGService.load`` when the two files + # disagreed. Legacy on-disk ``dep_graph.json`` files keep + # loading via ``RPGService.load``'s compat path. dep_graph_stats = { "dep_nodes": rpg.dep_graph.G.number_of_nodes(), "dep_edges": rpg.dep_graph.G.number_of_edges(), @@ -145,8 +127,12 @@ def run_encode( result_data = rpg.to_dict() - with open(output, "w", encoding="utf-8") as fh: - json.dump(result_data, fh, indent=2, ensure_ascii=False) + # Atomic write of the central pipeline artefact. A killed + # encode used to truncate rpg.json and brick downstream + # stages (skeleton / func_design / code_gen all read it); + # now the previous good rpg.json survives any interrupted + # write. 
+ atomic_write_rpg(output, result_data, indent=2, ensure_ascii=False) output_size = os.path.getsize(output) traj.complete_step(step_save.step_id, { @@ -175,13 +161,21 @@ def run_encode( logger.warning("Failed to generate visualization: %s", viz_exc) traj.fail_step(step_viz.step_id, str(viz_exc)) - # Collect stats — use result_data (serialized) edge count since + serialized_edges = result_data.get("edges", []) + edge_count = len(serialized_edges) if isinstance(serialized_edges, list) else 0 + if edge_count == 0: + try: + edge_count = len(rpg.edges) + except Exception: + edge_count = 0 + + # Collect stats — prefer result_data (serialized) edge count since # to_dict() merges dep-graph semantic edges that aren't in self.edges. stats = { "repo_name": repo_name, "output_path": output, "node_count": len(rpg.nodes), - "edge_count": len(result_data.get("edges", [])), + "edge_count": edge_count, } if viz_output: stats["viz_path"] = viz_output diff --git a/CoderMind/scripts/rpg_encoder/run_update_rpg.py b/CoderMind/scripts/rpg_encoder/run_update_rpg.py index 6d62336..440d2c5 100644 --- a/CoderMind/scripts/rpg_encoder/run_update_rpg.py +++ b/CoderMind/scripts/rpg_encoder/run_update_rpg.py @@ -30,6 +30,7 @@ DEP_GRAPH_FILE, WORKSPACE_ROOT, ) +from common.rpg_io import atomic_write_rpg # noqa: E402 def run_update_rpg( @@ -60,9 +61,9 @@ def run_update_rpg( cur_repo_dir = os.path.abspath(cur_repo_dir) last_repo_dir = os.path.abspath(last_repo_dir) rpg_file = os.path.abspath(rpg_file) - # ``dep_graph_path`` defaults to the standard ``.cmind/data/dep_graph.json`` - # location so that ``run_update_rpg.py`` (CLI) and the pre-commit - # hook agree on a single canonical file. + # ``dep_graph_path`` is a legacy standalone location retained for + # callers that still pass ``--dep-graph``. Normal updates embed the + # refreshed dependency graph in ``rpg.json``. 
if dep_graph_path is None: dep_graph_path = str(DEP_GRAPH_FILE) else: @@ -114,9 +115,11 @@ def run_update_rpg( pre_commit = (rpg.git_meta or {}).get("head_commit") # === Step 1: LLM-driven feature graph refactor === - # Now threaded with the dep_graph save path so the structural - # refresh inside process_diff actually persists dep_graph.json - # to disk (fixes the legacy ``_update_dep_graph_index`` bug). + # ``dep_graph_save_path=None``: the dep_graph rides inside + # ``rpg.json`` as the single source of truth (embedded by + # ``RPG.to_dict`` and persisted by the ``atomic_write_rpg`` below). + # The legacy standalone ``dep_graph.json`` is no longer produced; + # readers tolerate its absence and use the embedded copy. updated_rpg = RPGEvolution.process_diff( repo_name=repo_name, repo_info=repo_info, @@ -126,7 +129,6 @@ def run_update_rpg( last_rpg=rpg, last_feature_tree=feature_tree, update_dep_graph=True, - dep_graph_save_path=dep_graph_path, max_exclude_votes=max_exclude_votes, ) @@ -171,11 +173,14 @@ def run_update_rpg( except Exception as exc: logger.warning("set_git_meta after update_rpg failed: %s", exc) - # Save updated RPG in the same format as run_encode (rpg.to_dict()) + # Save updated RPG in the same format as run_encode (rpg.to_dict()). + # Atomic write: a kill mid-update used to leave a half-truncated + # rpg.json that bricked every subsequent ``cmind`` invocation; + # ``atomic_write_rpg`` swaps a fully-written ``.tmp`` into + # place so readers always see either the previous good rpg.json + # or the new one. result_data = updated_rpg.to_dict() - - with open(output, "w", encoding="utf-8") as fh: - json.dump(result_data, fh, indent=2, ensure_ascii=False) + atomic_write_rpg(output, result_data, indent=2, ensure_ascii=False) # Collect stats post_nodes = len(updated_rpg.nodes) @@ -227,8 +232,8 @@ def main(): "--dep-graph", default=None, help=( - "Path to write dep_graph.json (default: .cmind/data/dep_graph.json). 
" - "Must match the path used by the pre-commit sync hook to avoid drift." + "Legacy standalone dep_graph path. Normal updates embed the " + "dependency graph in rpg.json." ), ) parser.add_argument( diff --git a/CoderMind/scripts/rpg_encoder/semantic_parsing.py b/CoderMind/scripts/rpg_encoder/semantic_parsing.py index c6c884f..bf815fe 100644 --- a/CoderMind/scripts/rpg_encoder/semantic_parsing.py +++ b/CoderMind/scripts/rpg_encoder/semantic_parsing.py @@ -41,6 +41,7 @@ normalize_path, parse_solution_output, ) +from lang_parser import is_supported_source, is_test_file from rpg.code_unit import CodeSnippetBuilder, CodeUnit, ParsedFile from rpg.path_format import ( desc_key_class as _desc_key_class, @@ -916,17 +917,17 @@ def parse_repo( if excluded_files is None: excluded_files = [] - # Step 1: Collect valid Python files + # Step 1: Collect valid source files (any language registered with lang_parser) filtered_files = filter_excluded_files( valid_files=self.valid_files, excluded_files=excluded_files ) py_files = [ os.path.join(self.repo_dir, f) for f in filtered_files - if f.endswith(".py") + if is_supported_source(f) and not is_test_file(f) ] - self.logger.info("Total valid Python files to parse: %d", len(py_files)) + self.logger.info("Total valid source files to parse: %d", len(py_files)) file_code_map: Dict[str, str] = {} for file_path in py_files: diff --git a/CoderMind/scripts/rpg_encoder/version_control.py b/CoderMind/scripts/rpg_encoder/version_control.py index 2ac35ba..2a360a1 100644 --- a/CoderMind/scripts/rpg_encoder/version_control.py +++ b/CoderMind/scripts/rpg_encoder/version_control.py @@ -131,8 +131,11 @@ def save_version( "rpg": rpg.to_dict(), } - with open(filepath, "w", encoding="utf-8") as fh: - json.dump(payload, fh, indent=2, ensure_ascii=False) + # Atomic write: a kill mid-save used to leave a truncated history + # snapshot that ``rollback(version=N)`` could not parse. 
Aligns + # with :meth:`rollback` which already uses ``atomic_write_rpg`` + # for the main rpg.json write. + atomic_write_rpg(Path(filepath), payload, indent=2, ensure_ascii=False) logger.info( "Saved RPG version %d: %s (%s)", @@ -148,7 +151,7 @@ def save_version( return next_version def rollback(self, version: int) -> RPG: - """Restore an RPG from a previously saved version. + """Restore an RPG from a saved version. The restored RPG is also written to the main ``/rpg.json`` file so downstream tools can read it. diff --git a/CoderMind/scripts/rpg_visualize.py b/CoderMind/scripts/rpg_visualize.py index 1d9d69b..9f1b8de 100644 --- a/CoderMind/scripts/rpg_visualize.py +++ b/CoderMind/scripts/rpg_visualize.py @@ -3,15 +3,15 @@ Renders three views: 1. **Feat Graph** — collapsible tree layout (D3.js) from rpg.json -2. **Dep Graph** — collapsible force-directed layout from dep_graph.json - Nodes are grouped by file hierarchy, collapsible at any level. +2. **Dep Graph** — collapsible force-directed layout from the dep_graph + embedded in rpg.json. Nodes are grouped by file hierarchy, collapsible at any level. Edges merge when groups are collapsed. 3. **Mapping** — RPG feature tree (L→R) linked to dep tree (R→L) via _dep_to_rpg_map Default: only the first level (functional areas) is expanded. 
Usage: - python3 scripts/rpg_visualize.py [rpg.json] [--dep-graph dep_graph.json] [-o output.html] + python3 scripts/rpg_visualize.py [rpg.json] [--dep-graph legacy_dep_graph.json] [-o output.html] """ import argparse @@ -59,7 +59,7 @@ def load_rpg(path: str | Path, dep_graph_path: str | Path | None = None) -> dict if resolved_dep_path: data["dep_graph"] = load_json(resolved_dep_path) elif dep_graph_path: - raise FileNotFoundError(f"dep_graph.json not found: {dep_graph_path}") + raise FileNotFoundError(f"dep_graph override not found: {dep_graph_path}") return data @@ -1879,7 +1879,10 @@ def main(): parser.add_argument("rpg_file", nargs="?", default=str(RPG_FILE), help="Path to rpg.json (default: home-side workspace store at ~/.cmind/workspaces//data/rpg.json)") parser.add_argument("--dep-graph", default=None, - help="Path to dep_graph.json (default: dep_graph_file field or sibling dep_graph.json)") + help=( + "Optional legacy external dep_graph override. " + "By default the embedded dep_graph in rpg.json is used." + )) parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: .html)") args = parser.parse_args() diff --git a/CoderMind/scripts/run_batch.py b/CoderMind/scripts/run_batch.py index 43622aa..bf34e3b 100644 --- a/CoderMind/scripts/run_batch.py +++ b/CoderMind/scripts/run_batch.py @@ -78,6 +78,7 @@ from code_gen.test_runner import ( ensure_dev_venv, ensure_deps_installed, + resolve_test_backend, ) from code_gen.rpg_updater import run_rpg_update @@ -150,6 +151,22 @@ # only needs the sub-agent timeout directly for its argparse default. 
+def _setup_codegen_environment(repo_path: Path) -> None: + """Prepare the language-specific codegen environment.""" + backend = resolve_test_backend(repo_path=repo_path) + if backend.name != "python": + logger.info("Skipping Python venv setup for %s codegen", backend.display_name) + return + + try: + created_new, venv_path = ensure_dev_venv(repo_path) + if created_new: + logger.info("Created dev venv at %s", venv_path) + ensure_deps_installed(repo_path) + except Exception as exc: + logger.warning("Venv setup issue (non-fatal): %s", exc) + + # ============================================================================ # Module 1: Prompt Builder @@ -192,8 +209,8 @@ def _prepare_batch_context( ) -> Tuple[BatchExecutionState, Optional[Dict[str, Any]]]: """Build BatchExecutionState and dependency context for a task. - This mirrors the historical ``prepare_batch`` logic but returns data structures - instead of printing JSON. + Returns structured state rather than printing JSON, so the batch + runner can reuse the prepared dependency context directly. Returns: (batch_state, dependency_context) @@ -322,9 +339,11 @@ def run_single_attempt( if not agent_passed: result["failure_reason"] = agent_reason logger.info("Sub-agent self-reported FAIL: %s", agent_reason) - elif agent_summary is None: - # PASS without the required PYTEST_SUMMARY line is suspicious; - # log it so post_verify_failure analysis is easier. + elif agent_summary is None and not is_project_docs_batch(task): + # PASS without the required PYTEST_SUMMARY line is suspicious for a + # test-bearing task; log it so post_verify_failure analysis is easier. + # Docs/entry batches (README, requirements) run no tests and are + # post-verified by skip, so a missing summary is expected there. 
logger.warning( "Sub-agent reported PASS but did not provide PYTEST_SUMMARY line" ) @@ -427,7 +446,6 @@ def _refresh_dep_graph_safe( from rpg.service import RPGService rpg_path = REPO_RPG_FILE - dep_graph_path = DEP_GRAPH_FILE if not rpg_path.exists(): return @@ -435,20 +453,26 @@ def _refresh_dep_graph_safe( # ── Incremental path: codegen knows exactly which file changed ── if changed_files: - # Filter to .py only — sync_from_file_list assumes Python. - py_files = [f for f in changed_files if f.endswith(".py")] - if not py_files: - # No .py touched (e.g. only docs/config edits) — skip. - logger.info("dep_graph: no .py files in batch, skipping refresh") + # Keep only files lang_parser can build dep edges for. This spans + # every supported language (py/go/rs/ts/js/c/cpp), so non-Python + # projects keep an up-to-date dep_graph across batches too. + from lang_parser import is_supported_source + + source_files = [f for f in changed_files if is_supported_source(f)] + if not source_files: + # No analysable source touched (e.g. only docs/config edits). + logger.info("dep_graph: no supported source files in batch, skipping refresh") svc.save(str(rpg_path)) return + # ``save_path=None``: dep_graph rides inside rpg.json. The + # subsequent ``svc.save(rpg_path)`` embeds it. 
result = svc.sync_from_file_list( - file_paths=py_files, + file_paths=source_files, code_dir=str(repo_path), workspace_root=str(WORKSPACE_ROOT), - save_path=str(dep_graph_path), ) + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) logger.info( "dep_graph refreshed (mode=%s reason=%s): %d nodes, %d dep→rpg mappings", @@ -462,8 +486,8 @@ def _refresh_dep_graph_safe( svc.refresh_dep_graph( str(repo_path), workspace_root=str(WORKSPACE_ROOT), - save_path=str(dep_graph_path), ) + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) logger.info("dep_graph refreshed (full): %d nodes, %d dep→rpg mappings", len(svc.rpg.dep_graph.G.nodes()), @@ -607,15 +631,9 @@ def run_batch( logger.info("Branch: %s (initial_commit=%s)", branch_name, initial_commit[:8] if initial_commit else "none") - # ── Step 4: Setup venv ─────────────────────────────────────────── + # ── Step 4: Setup language environment ────────────────────────── - try: - created_new, venv_path = ensure_dev_venv(repo_path) - if created_new: - logger.info("Created dev venv at %s", venv_path) - ensure_deps_installed(repo_path) - except Exception as exc: - logger.warning("Venv setup issue (non-fatal): %s", exc) + _setup_codegen_environment(repo_path) # ── Step 5: Build prompts ──────────────────────────────────────── @@ -974,6 +992,8 @@ def main() -> int: help="Max units per merged batch (0 = no limit)") parser.add_argument("--agent-timeout", type=int, default=DEFAULT_AGENT_TIMEOUT, help=f"Sub-agent timeout in seconds (default: {DEFAULT_AGENT_TIMEOUT})") + parser.add_argument("--max-batches", type=int, default=0, + help="Stop --loop after this many batches (0 = no limit)") parser.add_argument("--review-iterations", type=int, default=10, help="Max iterations for global review (default: 10)") parser.add_argument("--json", action="store_true", help="Output as JSON") @@ -1088,11 +1108,20 @@ def _run_loop(args) -> int: total_passed = 0 total_failed = 0 start_time = time.time() + max_batches = max(0, 
int(args.max_batches or 0)) print("\n [START] Starting batch loop (Ctrl+C to stop after current batch)\n") try: while True: + if max_batches and batch_num >= max_batches: + elapsed = time.time() - start_time + print(f"\n [STOP] Reached max-batches={max_batches} " + f"({total_passed} passed, {total_failed} failed, " + f"{elapsed/60:.1f} min)") + logger.info("Loop stopped after max-batches=%d", max_batches) + return 0 if total_failed == 0 else 1 + batch_num += 1 result = run_batch( diff --git a/CoderMind/scripts/skeleton/file_designer.py b/CoderMind/scripts/skeleton/file_designer.py index 353a6ed..f30006a 100644 --- a/CoderMind/scripts/skeleton/file_designer.py +++ b/CoderMind/scripts/skeleton/file_designer.py @@ -31,6 +31,17 @@ from common import LLMClient from common.utils import get_project_background_context +# Skeleton design resolves a language backend from the project target +# language so file extensions, package markers, and prompt directives +# live with the rest of per-language decoder behaviour. Python projects +# receive an empty prompt directive; non-Python projects get a compact +# language preamble before skeleton prompts are rendered. +from decoder_lang import ( + get_backend, + resolve_decoder_language, + with_language_directive, +) + # ============================================================================ # Validation Functions @@ -38,44 +49,59 @@ def validate_directory_structure( dir_assignments: Dict[str, str], - required_components: List[str] + required_components: List[str], + backend: Optional[Any] = None, ) -> Tuple[bool, str]: """Validate that all required components have directory assignments. - + Args: dir_assignments: Mapping of component_name -> directory_path required_components: List of component names that must be covered - + backend: Optional :class:`decoder_lang.LanguageBackend`. When + supplied, each path segment is validated against the + backend's :meth:`is_valid_module_identifier`. 
When + ``None``, path segments must be valid Python identifiers. + Returns: (is_valid, error_message) """ errors = [] assigned_components = set(dir_assignments.keys()) required_set = set(required_components) - + # Check for missing components missing = required_set - assigned_components if missing: errors.append(f"Missing directory assignments for components: {sorted(missing)}") - + # Check for extra/unrecognized components extra = assigned_components - required_set if extra: errors.append(f"Unrecognized components in assignments: {sorted(extra)}") - - # Check for empty directory paths and Python identifier validity + + # Identifier validation falls back to Python rules when no backend + # is supplied. + if backend is None: + def _is_valid_segment(seg: str) -> bool: + return bool(seg) and seg.isidentifier() + identifier_kind = "Python identifier" + else: + _is_valid_segment = backend.is_valid_module_identifier + identifier_kind = f"{backend.display_name} identifier" + for comp, dir_path in dir_assignments.items(): if not dir_path or not dir_path.strip(): errors.append(f"Component '{comp}' has empty directory path") continue - # Each path segment used as a Python package must be a valid identifier + # Each path segment used as a package name must be a valid + # identifier for the target language. for segment in dir_path.replace("\\", "/").strip("/").split("/"): - if segment and not segment.isidentifier(): + if segment and not _is_valid_segment(segment): errors.append( f"Component '{comp}': directory segment '{segment}' is not a valid " - f"Python identifier (avoid hyphens; use underscores instead)" + f"{identifier_kind} (avoid hyphens; use underscores instead)" ) - + if errors: return False, "\n".join(errors) return True, "All components have valid directory assignments." 
@@ -158,7 +184,8 @@ def __init__( max_iterations: int = 10, config: Optional[Dict[str, Any]] = None, trajectory: Optional[Any] = None, - step_id: Optional[str] = None + step_id: Optional[str] = None, + target_language: Optional[str] = None, ): """Initialize FileDesigner. @@ -169,6 +196,12 @@ def __init__( config: Optional configuration dictionary trajectory: Optional trajectory tracker for logging steps step_id: Optional step ID for trajectory tracking + target_language: Optional explicit target language + (e.g. ``"python"``, ``"go"``). When ``None`` the + effective language is resolved from RPG root meta with + fallback to ``"python"``. The resolved backend provides + file-extension, package-marker, and prompt-directive + behaviour for skeleton generation. """ self.rpg = rpg self.llm_client = llm_client or LLMClient(trajectory=trajectory, step_id=step_id) @@ -179,6 +212,29 @@ def __init__( self.logger = logging.getLogger(__name__) + # Build a minimal RPG-shaped dict so language resolution does + # not trigger full graph serialization. 
+ rpg_meta_lang = None + repo_node = getattr(self.rpg, "repo_node", None) + if repo_node is not None and getattr(repo_node, "meta", None) is not None: + rpg_meta_lang = getattr(repo_node.meta, "language", None) + rpg_dict_minimal = {"root": {"meta": {"language": rpg_meta_lang}}} + feature_spec_stub = ( + { + "meta": { + "primary_language": target_language, + "target_languages": [target_language], + } + } + if target_language + else None + ) + self.target_language = resolve_decoder_language( + feature_spec=feature_spec_stub, + rpg_obj=rpg_dict_minimal, + ) + self.backend = get_backend(self.target_language) + # Load project background / technology context (empty string if unavailable) try: self._project_background = get_project_background_context() @@ -327,13 +383,13 @@ def _generate_directory_structure( tech_section = ( f"\n{self._project_background}\n" "When a specific technology stack is described above, design the directory\n" - "structure to accommodate framework-specific conventions (e.g., `templates/`\n" - "for Jinja2, `models.py` for ORM, `app.py` for Flask entry point).\n" + "structure to accommodate the target language and framework conventions.\n" ) - # Sanitize repo name for use as a Python package directory - # (e.g., "blog-system" -> "blog_system") - safe_repo_name = self.rpg.repo_name.replace("-", "_") + hints = self.backend.prompt_hints() + safe_repo_name = self.backend.sanitize_module_identifier( + self.rpg.repo_name.replace(" ", "_") + ) base_user_prompt = f"""## Repository Information {repo_info} @@ -344,7 +400,9 @@ def _generate_directory_structure( ## Task Assign each component to an appropriate directory path. Use "{safe_repo_name}" as the project name in paths (e.g., src/{safe_repo_name}/...). -IMPORTANT: Directory names MUST be valid Python identifiers (use underscores, not hyphens). 
+IMPORTANT: {hints.module_naming_rule} +Target layout example: +{hints.package_layout_example} IMPORTANT: You MUST assign ALL {len(required_components)} components: {', '.join(required_components)} """ @@ -360,7 +418,9 @@ def _generate_directory_structure( # Call LLM _, result, _ = self.llm_client.call_structured( - system_prompt=RAW_SKELETON_PROMPT, + system_prompt=with_language_directive( + RAW_SKELETON_PROMPT, self.backend, + ), user_prompt=user_prompt, response_model=DirectoryStructureOutput, purpose=f"directory_structure_{attempt + 1}" @@ -378,8 +438,12 @@ def _generate_directory_structure( for assignment in result.assignments: component_to_dir[assignment.component_name] = assignment.directory_path - # Validate completeness - is_valid, error_msg = validate_directory_structure(component_to_dir, required_components) + # Validate completeness (identifier rules come from the + # resolved backend so Go segments are checked against Go + # naming rules, not Python's). + is_valid, error_msg = validate_directory_structure( + component_to_dir, required_components, backend=self.backend, + ) if is_valid: self.logger.info("\n Directory Structure (validated):") @@ -437,6 +501,8 @@ def _assign_features_to_files( if self._project_background and self._project_background.strip(): tech_section = f"\n{self._project_background}\n" + hints = self.backend.prompt_hints() + user_prompt = f"""## Repository Information {repo_info} {tech_section} @@ -448,13 +514,16 @@ def _assign_features_to_files( {feature_list} ## Task -Assign ALL the above features to Python files under {comp_dir}/. +Assign ALL the above features to {hints.display_name} source files under {comp_dir}/. +Source files should use the {hints.file_extension} extension. Every feature MUST be assigned to exactly one file. 
""" # Call LLM for feature assignment _, result, _ = self.llm_client.call_structured( - system_prompt=GROUP_SKELETON_PROMPT, + system_prompt=with_language_directive( + GROUP_SKELETON_PROMPT, self.backend, + ), user_prompt=user_prompt, response_model=FileAssignmentOutput, purpose=f"feature_assignment_{comp_name}" @@ -509,8 +578,10 @@ def _assign_features_to_files( # Check for unassigned features unassigned = [f for f in features if f not in assigned_features] if unassigned: - # Create fallback file for unassigned features - fallback_file = f"{comp_dir}/misc.py" + # Create fallback file for unassigned features. Extension + # comes from the resolved language backend so a Go run + # produces ``misc.go`` instead of ``misc.py``. + fallback_file = f"{comp_dir}/misc{self.backend.file_extension}" comp_assignments.append({ "file_path": fallback_file, "features": unassigned, @@ -548,10 +619,11 @@ def _build_final_skeleton(self, file_assignments: List[Dict[str, Any]]): ) self.stats["files_created"] += 1 - # Add __init__.py files to all directories - init_files_added = self.skeleton.add_init_files() + # Add package-marker files to all directories (Python: + # ``__init__.py``; Go / Rust / TS: no-op via backend). + init_files_added = self.skeleton.add_init_files(backend=self.backend) self.stats["init_files_created"] = init_files_added - self.logger.info(f"Added {init_files_added} __init__.py files") + self.logger.info(f"Added {init_files_added} package marker files") self.logger.info(f"Created skeleton with {len(self.skeleton.path_to_node)} total nodes") @@ -628,6 +700,8 @@ def patch( if self._project_background and self._project_background.strip(): tech_section = f"\n{self._project_background}\n" + hints = self.backend.prompt_hints() + user_prompt = f"""## Repository Information {repo_info} {tech_section} @@ -638,13 +712,16 @@ def patch( {feature_list} ## Task -Assign ALL the above features to Python files under {comp_dir}/. 
+Assign ALL the above features to {hints.display_name} source files under {comp_dir}/. +Source files should use the {hints.file_extension} extension. Every feature MUST be assigned to exactly one file. You may add features to existing files in this directory or create new files. """ _, result, _ = self.llm_client.call_structured( - system_prompt=GROUP_SKELETON_PROMPT, + system_prompt=with_language_directive( + GROUP_SKELETON_PROMPT, self.backend, + ), user_prompt=user_prompt, response_model=FileAssignmentOutput, purpose=f"patch_feature_assignment_{comp_name}" @@ -654,7 +731,7 @@ def patch( if not result: self.logger.error(f"Patch assignment failed for component: {comp_name}") - fallback_file = f"{comp_dir}/misc.py" + fallback_file = f"{comp_dir}/misc{self.backend.file_extension}" all_assignments.append({ "file_path": fallback_file, "features": missing_features, @@ -694,7 +771,7 @@ def patch( unassigned = [f for f in missing_features if f not in assigned_features] if unassigned: - fallback_file = f"{comp_dir}/misc.py" + fallback_file = f"{comp_dir}/misc{self.backend.file_extension}" comp_assignments.append({ "file_path": fallback_file, "features": unassigned, diff --git a/CoderMind/scripts/skeleton/skeleton_models.py b/CoderMind/scripts/skeleton/skeleton_models.py index e09d07a..0d34aee 100644 --- a/CoderMind/scripts/skeleton/skeleton_models.py +++ b/CoderMind/scripts/skeleton/skeleton_models.py @@ -367,19 +367,50 @@ def load_json(cls, filepath: str) -> "RepoSkeleton": data = json.load(f) return cls.from_dict(data) - def add_init_files(self, skip_root: bool = True, docstring_template: Optional[str] = None) -> int: - """Add __init__.py files to all directories in the skeleton. - - This ensures that all directories are proper Python packages. + def add_init_files( + self, + skip_root: bool = True, + docstring_template: Optional[str] = None, + backend: Optional[Any] = None, + ) -> int: + """Add package-marker files to all directories in the skeleton. 
+ + When ``backend`` is supplied, the file name, content, and + per-directory "has source" predicate are sourced from the + backend. Backends whose :meth:`package_marker_filename` returns + ``None`` (Go, Rust, TypeScript, …) make this method a no-op + because directories without marker files are the language + convention. Args: - skip_root: Whether to skip adding __init__.py to root directory. - docstring_template: Optional docstring template. - Use {name} for directory name, {path} for directory path. + skip_root: Whether to skip adding the marker to the root. + docstring_template: Optional template (``{name}`` / + ``{path}``). Used only when the backend's + :meth:`package_marker_content` returns None (i.e. the + caller wants the built-in marker body). + backend: Optional :class:`decoder_lang.LanguageBackend`. + When ``None``, Python package-marker rules are used. Returns: - Number of __init__.py files added. + Number of marker files added (0 for languages that don't + use a marker file). """ + # When no backend is supplied, use Python package-marker rules. + if backend is None: + marker_filename: Optional[str] = "__init__.py" + source_extension: str = ".py" + else: + marker_filename = backend.package_marker_filename() + source_extension = backend.file_extension + + # Languages without a package marker (Go / Rust / TS) → no-op. + if marker_filename is None: + logging.debug( + "add_init_files: backend %s has no package marker; skipping", + getattr(backend, "name", "?"), + ) + return 0 + init_files_added = 0 # Get all directory nodes @@ -390,49 +421,61 @@ def add_init_files(self, skip_root: bool = True, docstring_template: Optional[st if skip_root and (dir_node.path == "." or dir_node == self.root): continue - # Skip non-Python directories (like docs, assets, etc.) 
- # Only add __init__.py to directories that contain Python files or subdirectories - has_python_content = False + # Skip directories that contain no source files in this + # language (mirrors the original heuristic, just + # parameterised). Sub-directories still count so that an + # empty package-only directory tree still gets markers + # placed correctly. + has_source_content = False for child in dir_node.children(): - if isinstance(child, FileNode) and child.name.endswith('.py'): - has_python_content = True + if isinstance(child, FileNode) and child.name.endswith(source_extension): + has_source_content = True break if isinstance(child, DirectoryNode): - has_python_content = True + has_source_content = True break - - # Also add if the directory is under a common Python package pattern + + # Also add if the directory is under a common Python package + # path. Non-Python backends opt out earlier via + # ``marker_filename is None``. is_python_pkg_path = any( - dir_node.path.startswith(prefix) + dir_node.path.startswith(prefix) for prefix in ['src/', 'lib/', 'pkg/', 'packages/'] ) or '/src/' in dir_node.path - if not has_python_content and not is_python_pkg_path: + if not has_source_content and not is_python_pkg_path: continue - # Build __init__.py path - init_path = normalize_path(os.path.join(dir_node.path, "__init__.py")) + # Build marker file path + init_path = normalize_path(os.path.join(dir_node.path, marker_filename)) - # Skip if __init__.py already exists + # Skip if marker already exists if init_path in self.path_to_node: continue - # Generate content for __init__.py - if docstring_template: + # Generate content for the marker file + if backend is not None: + content = backend.package_marker_content(dir_node.path) + # Backends that return None for content but emit a + # marker (rare; not used today) still need *some* body. 
+ if content is None: + content = "" + code = content + elif docstring_template: code = docstring_template.format( name=dir_node.name, - path=dir_node.path + path=dir_node.path, ) else: - # Default minimal docstring + # Default minimal marker docstring. code = f'"""Package: {dir_node.name}"""\n' - # Create __init__.py file node + # Create marker file node init_node = FileNode( - name="__init__.py", + name=marker_filename, path=init_path, code=code, - feature_paths=[] + feature_paths=[], ) # Add to directory and path registry @@ -440,9 +483,9 @@ def add_init_files(self, skip_root: bool = True, docstring_template: Optional[st self.path_to_node[init_path] = init_node init_files_added += 1 - logging.debug(f"Added __init__.py to: {dir_node.path}") + logging.debug(f"Added {marker_filename} to: {dir_node.path}") - logging.info(f"Added {init_files_added} __init__.py files to skeleton") + logging.info(f"Added {init_files_added} {marker_filename} files to skeleton") return init_files_added def get_statistics(self) -> Dict[str, Any]: diff --git a/CoderMind/scripts/skeleton/skeleton_prompts.py b/CoderMind/scripts/skeleton/skeleton_prompts.py index 565dd14..dffe6a6 100644 --- a/CoderMind/scripts/skeleton/skeleton_prompts.py +++ b/CoderMind/scripts/skeleton/skeleton_prompts.py @@ -22,7 +22,7 @@ ## Requirements 1. The structure must clearly separate each functional component and reflect logical domain boundaries. -2. Folder names must be concise, meaningful, and follow Python naming conventions (snake_case). +2. Folder names must be concise, meaningful, and follow the target language's naming conventions. 3. Component names serve as functional descriptions, not required folder names. - Rename folders as needed for clarity and readability. - Include a mapping from folder names to the original component names. 
@@ -64,19 +64,19 @@ # Group Skeleton Generation Prompt # ============================================================================ -GROUP_SKELETON_PROMPT = """You are a repository architect responsible for incrementally assigning features from a functional component into a production-grade Python repository structure. +GROUP_SKELETON_PROMPT = """You are a repository architect responsible for incrementally assigning features from a functional component into a production-grade target-language repository structure. -Your primary goals are clarity, modularity, and long-term maintainability. The resulting layout should resemble a modern, well-structured Python library rather than a direct projection of the feature tree. +Your primary goals are clarity, modularity, and long-term maintainability. The resulting layout should resemble a modern, well-structured repository in the target language rather than a direct projection of the feature tree. You may: - Group related features into shared modules, - Introduce or adjust folders when semantically appropriate, - Refine or reorganize previous design decisions as needed. -Your task is to assign each feature to a `.py` file path that: +Your task is to assign each feature to a target-language source file path that: - Begins with the designated folder, - Groups semantically related features together (even if they originate from different branches of the feature tree), -- Reflects realistic Python module organization, +- Reflects realistic target-language module/package organization, - Uses folders where helpful to express higher-level structure. ## Rules @@ -93,12 +93,12 @@ - When a folder becomes crowded, introduce semantically meaningful subfolders rather than scattering features into many tiny modules. ### Naming and Organization Guidelines -1. Use clear, concise, semantically meaningful names in `snake_case`. Each file or folder should represent a well-scoped functional area. -2. 
Names should reflect functional purpose without redundancy. Avoid repeating folder context in filenames when it is obvious (for example, inside `auth/`, prefer `token.py` over `auth_token.py`). -3. Avoid vague or purely placeholder names such as `module_part1.py` or `other_module.py`. +1. Use clear, concise, semantically meaningful names that follow target-language conventions. Each file or folder should represent a well-scoped functional area. +2. Names should reflect functional purpose without redundancy. Avoid repeating folder context in filenames when it is obvious. +3. Avoid vague or purely placeholder names such as `module_part1` or `other_module`. 4. Utility-style modules are allowed when they are clearly scoped. Examples: - - `vector_utils.py`, `io_utils.py`, or `text_utils.py` inside appropriately named folders, - - `util.py` or `utils.py` within a well-defined domain folder, where the utility code is narrowly focused on that domain. + - `vector_utils`, `io_utils`, or `text_utils` (with the target language's file extension) inside appropriately named folders, + - a `util`/`utils` module within a well-defined domain folder, where the utility code is narrowly focused on that domain. These should not become unbounded catch-all modules. 5. It is acceptable to place features originating from multiple original subtrees into the same file if they form a coherent functional unit in the repository architecture. @@ -113,7 +113,7 @@ { "assignments": [ { - "file_path": "src/project/component/module.py", + "file_path": "src/project/component/module.ext", "features": ["feature1", "feature2"], "purpose": "Brief description of file purpose" } @@ -166,7 +166,7 @@ - Do not add new fields or categories beyond the four listed. """ -GROUP_SKELETON_REVIEW_PROMPT = """You are a senior software architect reviewing the feature-to-file assignments proposed by an architecture assistant. 
Your role is to critically evaluate the structural quality of the resulting Python module layout across the five criteria below. +GROUP_SKELETON_REVIEW_PROMPT = """You are a senior software architect reviewing the feature-to-file assignments proposed by an architecture assistant. Your role is to critically evaluate the structural quality of the resulting target-language module layout across the five criteria below. ## Review Criteria ### 1. File Scope Appropriateness @@ -179,14 +179,14 @@ - The folder hierarchy should reflect clean separations of concern and meaningful domain boundaries. - Introduce subfolders when a directory becomes crowded or mixes distinct types of functionality. - Avoid excessively flat or deeply nested layouts. -- Detect filename clusters with shared prefixes and organize them into subfolders; avoid redundant naming (e.g., `nlp/nlp_tokenizer.py`). +- Detect filename clusters with shared prefixes and organize them into subfolders; avoid redundant naming (e.g., a `nlp/` folder whose files repeat the `nlp_` prefix). ### 3. Modularity & Cohesion - Modules should exhibit strong internal cohesion and minimal coupling. - Each module should map to a single clear abstraction. - Flag mixed-purpose, catch-all, or poorly scoped modules for redesign. ### 4. Naming Quality -- Names must be clear, concise, meaningful, and consistently in `snake_case`. +- Names must be clear, concise, meaningful, and consistent with target-language naming conventions. - Avoid redundancy between folder and file names. - Reject vague, generic, placeholder, or suffix-based names. - Prefer succinct, expressive names that accurately reflect functionality. @@ -198,7 +198,7 @@ ### Special Emphasis - Apply strict scrutiny to both naming and structural decisions. -- Placeholder or incremental naming patterns (`_a.py`, `_b.py`, `_c.py`) must be rejected. +- Placeholder or incremental naming patterns must be rejected. 
- When flagging an issue, always recommend specific, meaningful alternatives. ## Output Format diff --git a/CoderMind/scripts/smoke_test.py b/CoderMind/scripts/smoke_test.py index 35cc259..71e9c4d 100644 --- a/CoderMind/scripts/smoke_test.py +++ b/CoderMind/scripts/smoke_test.py @@ -38,6 +38,40 @@ logger = logging.getLogger(__name__) + +def _resolve_backend(repo_path: Path): + """Resolve the target-language backend for ``repo_path``. + + Reads explicit language metadata from the repo's ``.cmind/data`` + artefacts (feature_spec / rpg, written by the encoder / decoder) and + falls back to scanning the real source files on disk, so the smoke + test detects the right language even when that metadata is missing or + unreadable. Degrades to Python only for a genuinely empty / unknown + repo. Never raises. + """ + try: + from decoder_lang import resolve_repo_backend + except Exception: # noqa: BLE001 + return None + + def _load(rel: str): + try: + artefact = repo_path / ".cmind" / "data" / rel + if artefact.is_file(): + return json.loads(artefact.read_text(encoding="utf-8")) + except Exception: # noqa: BLE001 + return None + return None + + try: + return resolve_repo_backend( + repo_path, + feature_spec=_load("feature_spec.json"), + rpg_obj=_load("rpg.json"), + ) + except Exception: # noqa: BLE001 + return None + # ============================================================================ # Data Classes # ============================================================================ @@ -216,21 +250,99 @@ def check_imports(repo_path: Path, result: SmokeResult) -> Dict[str, Any]: # Layer 2: Entry Point Validation # ============================================================================ +def _locate_existing_entry(repo_path: Path, backend: Any) -> Optional[str]: + """Return an existing entry file matching the backend's accepted shapes. + + ``entry_point_candidates`` may contain ``*`` globs (Go's + ``cmd/*/main.go``). 
The canonical ``entry_point_path`` slug often differs + from the one the skeleton chose, so probing the accepted shapes locates a + real entry the canonical path would miss. Returns the first existing + repo-relative POSIX match, or None when no candidate resolves to a file. + """ + try: + candidates = backend.entry_point_candidates() + except Exception: # noqa: BLE001 + return None + for pattern in candidates: + if any(ch in pattern for ch in "*?["): + for match in sorted(repo_path.glob(pattern)): + if match.is_file(): + return match.relative_to(repo_path).as_posix() + elif (repo_path / pattern).is_file(): + return pattern + return None + + def check_entry_point(repo_path: Path, result: SmokeResult) -> Dict[str, Any]: - """Verify main.py can start and --help works.""" + """Verify the project's entry point starts and ``--help`` works. + + Language-aware: the entry path and run command come from the target + backend (``main.py`` for Python, ``go run ./cmd/...`` for Go, etc.). + The command runs in a *clean* checkout — no ``PYTHONPATH`` / path + bridging is injected — so a project that imports its own package but + ships no install metadata (the src/-layout ``ModuleNotFoundError`` + case) is caught here rather than passing silently. + """ logger.info("Layer 2: Entry point check") - main_py = repo_path / "main.py" - python_exe = _get_python_exe(repo_path) + backend = _resolve_backend(repo_path) - if not main_py.exists(): - logger.info(" No main.py found, skipping") - return {"skipped": True, "reason": "no main.py"} + # Resolve entry path + run command from the backend. Fall back to the + # historical Python ``main.py --help`` when no backend is available. 
+ entry_rel = None + run_cmd = None + if backend is not None: + try: + entry_rel = backend.entry_point_path("") + run_cmd = backend.entry_run_command(repo_path, entry_rel) + except Exception: # noqa: BLE001 + entry_rel, run_cmd = None, None + + if run_cmd is None and backend is not None and backend.name != "python": + # The canonical entry slug often differs from the one the skeleton + # actually chose (Go: canonical ``cmd/app/main.go`` vs generated + # ``cmd/todoapp/main.go``), so ``entry_run_command`` returns None for a + # repo that does ship a runnable entry. Probe the backend's accepted + # entry shapes (globs allowed) to locate the real entry before giving + # up, so it is actually validated instead of silently skipped. + located = _locate_existing_entry(repo_path, backend) + if located is not None: + entry_rel = located + try: + run_cmd = backend.entry_run_command(repo_path, located) + except Exception: # noqa: BLE001 + run_cmd = None + + if run_cmd is None and backend is not None and backend.name != "python": + # Compiled CLIs (C/C++) and toolchain-less hosts expose no run + # probe; treat as a non-fatal skip rather than a failure. + logger.info(" No run probe for %s project, skipping", backend.name) + return {"skipped": True, "reason": f"no run probe for {backend.name}"} + + if run_cmd is None: + main_py = repo_path / "main.py" + if not main_py.exists(): + logger.info(" No main.py found, skipping") + return {"skipped": True, "reason": "no main.py"} + python_exe = _get_python_exe(repo_path) + run_cmd = [python_exe, "main.py", "--help"] + entry_rel = "main.py" layer = {"exists": True, "help_works": False, "help_length": 0, "startup_error": None} - # Try --help (safe, exits immediately) + # Run the entry probe in a CLEAN subprocess: do NOT inject PYTHONPATH, + # so missing install metadata surfaces as a real startup failure. 
+ def _run_clean(cmd: List[str], timeout: int = 30) -> subprocess.CompletedProcess: + env = os.environ.copy() + env["PYTHONDONTWRITEBYTECODE"] = "1" + env.pop("PYTHONPATH", None) + return subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + cwd=str(repo_path), env=env, + ) + + label = entry_rel or "entry point" try: - proc = _run_in_repo(repo_path, [python_exe, "main.py", "--help"], timeout=15) + proc = _run_clean(run_cmd, timeout=30) if proc.returncode == 0: layer["help_works"] = True layer["help_length"] = len(proc.stdout) @@ -238,22 +350,22 @@ def check_entry_point(repo_path: Path, result: SmokeResult) -> Dict[str, Any]: result.add_finding(SmokeFinding( layer="entry_point", severity="warning", check="help_too_short", - message=f"main.py --help output is only {len(proc.stdout)} chars (possible stub)", + message=f"{label} --help output is only {len(proc.stdout)} chars (possible stub)", )) else: layer["startup_error"] = proc.stderr.strip().splitlines()[-1] if proc.stderr.strip() else "nonzero exit" result.add_finding(SmokeFinding( layer="entry_point", severity="error", check="help_fails", - message=f"main.py --help failed: {layer['startup_error']}", + message=f"{label} entry probe failed: {layer['startup_error']}", details=proc.stderr[-1000:] if proc.stderr else "", )) except subprocess.TimeoutExpired: - layer["startup_error"] = "timed out (15s)" + layer["startup_error"] = "timed out (30s)" result.add_finding(SmokeFinding( layer="entry_point", severity="error", check="help_timeout", - message="main.py --help timed out (15s) — may hang on startup", + message=f"{label} entry probe timed out (30s) — may hang on startup", )) layer["passed"] = layer["help_works"] @@ -330,9 +442,20 @@ def run_smoke_test( result = SmokeResult() + # The import and stub layers parse Python with the stdlib ``ast`` and + # only glob ``*.py``; they are meaningless for other languages. 
Skip + # them for non-Python projects (the entry layer is language-aware via + # the backend and still runs). Default to Python when undetermined. + backend = _resolve_backend(repo_path) + result.project_type = backend.name if backend is not None else "python" + is_python = backend is None or backend.name == "python" + # Layer 1: Import completeness if "imports" in run_layers: - result.layers["imports"] = check_imports(repo_path, result) + if is_python: + result.layers["imports"] = check_imports(repo_path, result) + else: + result.layers["imports"] = {"skipped": True, "reason": f"{backend.name} (python-only layer)"} # Layer 2: Entry point if "entry" in run_layers: @@ -340,7 +463,10 @@ def run_smoke_test( # Layer 3: Stub/placeholder detection if "stubs" in run_layers: - result.layers["stubs"] = check_stubs(repo_path, result) + if is_python: + result.layers["stubs"] = check_stubs(repo_path, result) + else: + result.layers["stubs"] = {"skipped": True, "reason": f"{backend.name} (python-only layer)"} result.duration = time.time() - start return result diff --git a/CoderMind/scripts/tools/gui.py b/CoderMind/scripts/tools/gui.py index ed1093c..f175635 100644 --- a/CoderMind/scripts/tools/gui.py +++ b/CoderMind/scripts/tools/gui.py @@ -209,7 +209,7 @@ def cmd_stop_display(display: str = DEFAULT_DISPLAY): def cmd_launch(command: str, display: str = DEFAULT_DISPLAY, wait: float = LAUNCH_WAIT): """Launch a GUI application on the virtual display.""" - # Kill any previously launched app to avoid multiple instances + # Close the tracked app process before launching a replacement. prev_pid = _load_app_pid() if prev_pid is not None: print(f" Closing previous app (pid {prev_pid}) before re-launch") diff --git a/CoderMind/scripts/update_graphs.py b/CoderMind/scripts/update_graphs.py index 5978c7f..b9964b3 100644 --- a/CoderMind/scripts/update_graphs.py +++ b/CoderMind/scripts/update_graphs.py @@ -2,7 +2,7 @@ """Unified graph update tool — update dep_graph, feature graph, or both. 
Subcommands: - dep Rebuild dep_graph.json from AST (no RPG changes) + dep Rebuild the AST dependency graph and embed it in rpg.json enrich Enrich feature graph from actual code (align paths + fill missing) sync Full sync: dep + enrich + mappings update-rpg Full RPG update (dep_graph + feature graph via LLM) against @@ -26,13 +26,14 @@ import sys import time from pathlib import Path +from typing import Optional SCRIPTS_DIR = Path(__file__).resolve().parent if str(SCRIPTS_DIR) not in sys.path: sys.path.insert(0, str(SCRIPTS_DIR)) from common.paths import REPO_RPG_FILE, DEP_GRAPH_FILE, RPG_HTML_FILE, HOOK_CALLS_LOG # noqa: E402 -from common.rpg_io import safe_load_rpg # noqa: E402 +from common.rpg_io import atomic_write_rpg, safe_load_rpg # noqa: E402 # Shared message used by every subcommand that requires an existing @@ -114,8 +115,23 @@ def _refresh_rpg_html(rpg_path: Path) -> dict: return result -def update_dep_only(code_dir: str, workspace_root: str, dep_graph_path: Path) -> dict: - """Mode: dep — Only rebuild dep_graph.json from AST, no RPG changes.""" +def update_dep_only(code_dir: str, workspace_root: str, dep_graph_path: Path, + rpg_path: Optional[Path] = None) -> dict: + """Mode: dep — Rebuild dep_graph from AST and persist into rpg.json. + + In the embedded-dep_graph world the dep_graph lives inside + ``rpg.json`` (see ``RPG.to_dict(include_dep_graph=True)``). This + mode therefore reads the current ``rpg.json``, swaps in the freshly + rebuilt dep_graph, and writes ``rpg.json`` back out. When + ``rpg_path`` is ``None`` (or the file is missing) we fall back to + the legacy standalone ``dep_graph.json`` write so that environments + which haven't run the encoder yet still get a useful artefact — + this is the path the very-first pre-commit hook hits on a fresh + workspace before any RPG exists. + + ``dep_graph_path`` is preserved as a parameter for CLI back-compat + but is now used only in the legacy fallback path. 
+ """ from rpg.dep_graph import DependencyGraph t0 = time.time() @@ -123,17 +139,38 @@ def update_dep_only(code_dir: str, workspace_root: str, dep_graph_path: Path) -> dg.build() dg.parse() - # Save with metadata wrapper. ``relpath`` returns ``"."`` when - # ``code_dir == workspace_root`` (workspace == repo, the common case); - # normalise to ``""`` so consumers can use a plain truthy check. - raw = dg.to_dict() _rel = os.path.relpath(code_dir, workspace_root) - raw["code_dir"] = "" if _rel == "." else _rel + code_dir_rel = "" if _rel == "." else _rel + + # Preferred path: dep_graph rides inside rpg.json (single source of truth). + if rpg_path is not None and rpg_path.is_file(): + from rpg.service import RPGService + svc = RPGService.load(str(rpg_path)) + svc.rpg.dep_graph = dg + svc.rpg._dep_graph_code_dir = code_dir_rel + svc.rpg._dep_to_rpg_map = svc.rpg._build_dep_to_rpg_map() + svc.rpg.rebuild_cross_maps() + # Drop the legacy external pointer so RPGService.load doesn't + # override the embedded dep_graph on the next read. + svc.rpg._dep_graph_file = None + svc.save(str(rpg_path)) + + return { + "mode": "dep", + "dep_nodes": len(dg.G.nodes()), + "dep_edges": len(dg.G.edges()), + "rpg_path": str(rpg_path), + "duration": round(time.time() - t0, 3), + } + + # Legacy fallback: write standalone dep_graph.json for environments + # without an rpg.json yet (rare in practice — the pre-commit hook + # exits early on workspaces that never ran the encoder). 
+ raw = dg.to_dict() + raw["code_dir"] = code_dir_rel from datetime import datetime, timezone raw["generated_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S") - - with open(str(dep_graph_path), "w", encoding="utf-8") as f: - json.dump(raw, f, ensure_ascii=False, indent=2) + atomic_write_rpg(str(dep_graph_path), raw, ensure_ascii=False, indent=2) return { "mode": "dep", @@ -146,13 +183,15 @@ def update_dep_only(code_dir: str, workspace_root: str, dep_graph_path: Path) -> def update_mapping(rpg_path: Path, code_dir: str, workspace_root: str, dep_graph_path: Path) -> dict: - """Mode: mapping — Rebuild dep_graph + dep↔rpg mappings + save both.""" + """Mode: mapping — Rebuild dep_graph + dep↔rpg mappings, persist into rpg.json.""" from rpg.service import RPGService t0 = time.time() svc = RPGService.load(str(rpg_path)) - svc.refresh_dep_graph(code_dir, workspace_root=workspace_root, - save_path=str(dep_graph_path)) + # ``save_path=None``: dep_graph rides inside rpg.json (no standalone file) + svc.refresh_dep_graph(code_dir, workspace_root=workspace_root) + # Drop any stale external pointer left by older runs. + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) return { @@ -163,32 +202,48 @@ def update_mapping(rpg_path: Path, code_dir: str, workspace_root: str, "feature_to_dep": len(svc.rpg._feature_to_dep_map), "rpg_nodes": len(svc.rpg._node_index), "rpg_edges": len(svc.rpg.edges), - "dep_graph_path": str(dep_graph_path), "rpg_path": str(rpg_path), "duration": round(time.time() - t0, 3), } def update_feature(rpg_path: Path, dep_graph_path: Path) -> dict: - """Mode: feature — Load existing dep_graph, rebuild mappings + edges only.""" + """Mode: feature — Load existing dep_graph, rebuild mappings + edges only. + + Reads dep_graph from rpg.json's embedded copy (the new contract); only + falls back to the standalone ``dep_graph.json`` for legacy workspaces + that haven't been re-encoded since the embed migration. 
+ """ from rpg.service import RPGService from rpg.models import RPG t0 = time.time() svc = RPGService.load(str(rpg_path)) - if not dep_graph_path.exists(): - return {"mode": "feature", "error": f"dep_graph.json not found: {dep_graph_path}"} - - # Load dep_graph without re-scanning AST - dg = RPG.load_dep_graph(dep_graph_path) - svc.rpg.dep_graph = dg + # Prefer the embedded dep_graph that RPGService.load already + # attached. Only touch the standalone file when the embedded copy + # is absent (legacy on-disk rpg.json from before the embed + # migration). + if svc.rpg.dep_graph is None: + if not dep_graph_path.exists(): + return { + "mode": "feature", + "error": ( + f"rpg.json has no embedded dep_graph and no standalone " + f"dep_graph.json found at {dep_graph_path}. " + "Run `cmind script update_graphs.py sync` to rebuild it." + ), + } + # Legacy compat path + dg = RPG.load_dep_graph(dep_graph_path) + svc.rpg.dep_graph = dg # Rebuild mappings svc.rpg._dep_to_rpg_map = svc.rpg._build_dep_to_rpg_map() svc.rpg.rebuild_cross_maps() # Save RPG (edges will be merged from dep_graph via to_dict) + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) return { @@ -203,21 +258,22 @@ def update_feature(rpg_path: Path, dep_graph_path: Path) -> dict: def update_full(rpg_path: Path, code_dir: str, workspace_root: str, dep_graph_path: Path) -> dict: - """Mode: full — AST scan + mappings + edges + save everything.""" + """Mode: full — AST scan + mappings + edges, persist into rpg.json.""" from rpg.service import RPGService t0 = time.time() svc = RPGService.load(str(rpg_path)) - # Rebuild dep_graph from code - svc.refresh_dep_graph(code_dir, workspace_root=workspace_root, - save_path=str(dep_graph_path)) + # Rebuild dep_graph from code; ``save_path=None`` so dep_graph rides + # inside rpg.json only. 
+ svc.refresh_dep_graph(code_dir, workspace_root=workspace_root) # Count dep_graph semantic edges that will merge into RPG edges dep_semantic_edges = [ e for e in svc.rpg.get_dep_edges_for_rpg() ] + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) return { @@ -229,7 +285,6 @@ def update_full(rpg_path: Path, code_dir: str, workspace_root: str, "dep_semantic_edges_merged": len(dep_semantic_edges), "rpg_nodes": len(svc.rpg._node_index), "rpg_edges": len(svc.rpg.edges), - "dep_graph_path": str(dep_graph_path), "rpg_path": str(rpg_path), "duration": round(time.time() - t0, 3), } @@ -244,9 +299,8 @@ def cmd_enrich(rpg_path: Path, code_dir: str, workspace_root: str, t0 = time.time() svc = RPGService.load(str(rpg_path)) - # Rebuild dep_graph first for accuracy - svc.refresh_dep_graph(code_dir, workspace_root=workspace_root, - save_path=str(dep_graph_path)) + # Rebuild dep_graph first for accuracy (embedded only — single source). + svc.refresh_dep_graph(code_dir, workspace_root=workspace_root) # Run enrichment (skip_dep_rebuild since refresh_dep_graph already did it) enrich_result = svc.enrich_from_code( @@ -258,13 +312,13 @@ def cmd_enrich(rpg_path: Path, code_dir: str, workspace_root: str, ) if not dry_run: + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) enrich_result.update({ "mode": "enrich", "dry_run": dry_run, "rpg_path": str(rpg_path), - "dep_graph_path": str(dep_graph_path), "duration": round(time.time() - t0, 3), }) return enrich_result @@ -319,10 +373,11 @@ def cmd_sync( svc = RPGService.load(str(rpg_path)) + # ``save_path=None``: dep_graph rides inside rpg.json (single source). + # The caller's ``svc.save(rpg_path)`` below embeds it. 
sync_result = svc.sync_from_commit_diff( code_dir=code_dir, workspace_root=workspace_root, - save_path=str(dep_graph_path), file_limit=file_limit, staged_only=staged_only, force_full=force_full, @@ -335,6 +390,7 @@ def cmd_sync( if sync_result.get("mode") != "noop": enrich_result = svc.enrich_from_code(code_dir, skip_dep_rebuild=True) + svc.rpg._dep_graph_file = None svc.save(str(rpg_path)) # Keep ``rpg.html`` aligned with the freshly-saved ``rpg.json``. @@ -366,7 +422,6 @@ def cmd_sync( "filled": enrich_result.get("filled", 0), "groups_created": enrich_result.get("groups_created", 0), "rpg_nodes": len(svc.rpg._node_index), - "dep_graph_path": str(dep_graph_path), "rpg_path": str(rpg_path), "viz_path": viz_result.get("viz_path"), "viz_error": viz_result.get("viz_error"), @@ -524,9 +579,17 @@ def cmd_status(rpg_path: Path, dep_graph_path: Path) -> dict: "rpg_path": str(rpg_path), "dep_graph_path": str(dep_graph_path), "rpg_exists": rpg_path.exists(), - "dep_graph_exists": dep_graph_path.exists(), + "legacy_dep_graph_exists": dep_graph_path.exists(), + "dep_graph_exists": False, + "dep_graph_source": "none", } + def _count_graph(graph_data: dict) -> None: + nodes = graph_data.get("nodes") or [] + edges = graph_data.get("edges") or [] + status["dep_nodes"] = len(nodes) if isinstance(nodes, (list, dict)) else 0 + status["dep_edges"] = len(edges) if isinstance(edges, (list, dict)) else 0 + if rpg_path.exists(): try: # Use safe_load_rpg so a corrupted rpg.json doesn't crash @@ -561,6 +624,13 @@ def _walk(node): status["last_synced_short"] = git_meta.get("head_short") status["last_synced_branch"] = git_meta.get("head_branch") status["last_synced_at"] = git_meta.get("head_timestamp") + embedded_dep = rpg_data.get("dep_graph") + if isinstance(embedded_dep, dict) and ( + embedded_dep.get("nodes") or embedded_dep.get("edges") + ): + status["dep_graph_exists"] = True + status["dep_graph_source"] = "embedded" + _count_graph(embedded_dep) except (OSError, json.JSONDecodeError) 
as exc: status["rpg_error"] = str(exc) @@ -581,12 +651,13 @@ def _walk(node): if last and current_head.get("head_commit"): status["rpg_in_sync_with_head"] = last == current_head["head_commit"] - if dep_graph_path.exists(): + if status["dep_graph_source"] == "none" and dep_graph_path.exists(): try: with open(dep_graph_path, "r", encoding="utf-8") as f: dg_data = json.load(f) - status["dep_nodes"] = len(dg_data.get("nodes") or []) - status["dep_edges"] = len(dg_data.get("edges") or []) + status["dep_graph_exists"] = True + status["dep_graph_source"] = "legacy_file" + _count_graph(dg_data) status["dep_generated_at"] = dg_data.get("generated_at") except (OSError, json.JSONDecodeError) as exc: status["dep_graph_error"] = str(exc) @@ -729,13 +800,16 @@ def _add_common(p): p.add_argument("--rpg", type=Path, default=REPO_RPG_FILE, help="Path to RPG file (repo_rpg.json)") p.add_argument("--dep-graph", type=Path, default=DEP_GRAPH_FILE, - help="Path to dep_graph.json") + help=( + "Legacy standalone dep_graph path used only " + "when rpg.json has no embedded dep_graph" + )) p.add_argument("--code-dir", type=str, default=None, help="Code directory (default: auto-detect)") p.add_argument("--json", action="store_true", help="JSON output") # dep - p_dep = sub.add_parser("dep", help="Rebuild dep_graph.json from AST") + p_dep = sub.add_parser("dep", help="Rebuild dep_graph from AST into rpg.json") _add_common(p_dep) # enrich @@ -852,7 +926,13 @@ def _add_common(p): # Dispatch if command == "dep": - result = update_dep_only(code_dir, workspace_root, args.dep_graph) + # ``rpg_path`` is preferred (embedded dep_graph); falls back to + # writing a legacy standalone dep_graph when the workspace has no + # rpg.json yet (very first commit before /cmind.encode). 
+ result = update_dep_only( + code_dir, workspace_root, args.dep_graph, + rpg_path=args.rpg, + ) elif command == "mapping": result = update_mapping(args.rpg, code_dir, workspace_root, args.dep_graph) elif command == "feature": diff --git a/CoderMind/src/cmind_cli/__init__.py b/CoderMind/src/cmind_cli/__init__.py index ce79ac0..753855a 100644 --- a/CoderMind/src/cmind_cli/__init__.py +++ b/CoderMind/src/cmind_cli/__init__.py @@ -1793,6 +1793,8 @@ def _workspace_has_python_code(project_path: Path) -> bool: _ENCODE_RE_TOTAL_FILES = re.compile(r"Total valid Python files to parse:\s*(\d+)") _ENCODE_RE_CLASS_BATCHES = re.compile(r"\[GLOBAL\] kind=class,\s*groups=\d+,\s*batches=(\d+)") _ENCODE_RE_FUNC_BATCHES = re.compile(r"\[GLOBAL\] kind=function,\s*groups=\d+,\s*batches=(\d+)") +_ENCODE_RE_CLASS_PROCESS = re.compile(r"\[GLOBAL\] process_class_batch:") +_ENCODE_RE_FUNC_PROCESS = re.compile(r"\[GLOBAL\] process_func_batch:") _ENCODE_RE_CLASS_FINISHED = re.compile(r"\[GLOBAL\] finished class batch with \d+ units") _ENCODE_RE_FUNC_FINISHED = re.compile(r"\[GLOBAL\] finished function batch with \d+ units") _ENCODE_RE_FILE_REMAP = re.compile(r"\[GLOBAL\] file=") @@ -1859,23 +1861,33 @@ def _parse_encoder_line(line: str, state: Dict[str, Any]) -> None: state["kind"] = "function" state["phase"] = "Parsing function batches" return - if "process_class_batch:" in line: + if _ENCODE_RE_CLASS_PROCESS.search(line): + state["class_done"] += 1 + if state.get("class_total"): + state["class_done"] = min(state["class_done"], state["class_total"]) + state["_class_counted_on_process"] = True state["kind"] = "class" state["phase"] = "Parsing class batches" return if _ENCODE_RE_CLASS_FINISHED.search(line): - state["class_done"] += 1 + if not state.get("_class_counted_on_process"): + state["class_done"] += 1 if state.get("class_total"): state["class_done"] = min(state["class_done"], state["class_total"]) state["kind"] = "class" state["phase"] = "Parsing class batches" return - if 
"process_func_batch:" in line: + if _ENCODE_RE_FUNC_PROCESS.search(line): + state["func_done"] += 1 + if state.get("func_total"): + state["func_done"] = min(state["func_done"], state["func_total"]) + state["_func_counted_on_process"] = True state["kind"] = "function" state["phase"] = "Parsing function batches" return if _ENCODE_RE_FUNC_FINISHED.search(line): - state["func_done"] += 1 + if not state.get("_func_counted_on_process"): + state["func_done"] += 1 if state.get("func_total"): state["func_done"] = min(state["func_done"], state["func_total"]) state["kind"] = "function" @@ -2239,7 +2251,8 @@ def _maybe_offer_initial_encode( # Fallback for environments where storage resolution fails; # err on the side of running the encoder rather than skipping it. rpg_file = project_path / ".cmind" / "data" / "rpg.json" - if rpg_file.exists(): + legacy_rpg_file = project_path / ".cmind" / "data" / "rpg.json" + if rpg_file.exists() or legacy_rpg_file.exists(): return if encode_choice is False: @@ -2339,11 +2352,10 @@ def _install_claude_hooks(project_path: Path) -> None: session_start = [] def _is_cmind_entry(entry: object) -> bool: - """Detect a previously-installed CoderMind SessionStart entry. + """Detect an existing CoderMind SessionStart entry. - Matches both the current (shlex-quoted) and earlier - (json.dumps-quoted) command shapes, plus any custom CoderMind - entry the user may have added that still calls update_graphs.py. + Matches the supported command shapes plus any custom CoderMind + entry the user may have added that still calls ``update_graphs.py``. """ if not isinstance(entry, dict): return False @@ -2478,13 +2490,11 @@ def _resolve_git_hooks_dir(project_path: Path) -> Optional[Path]: return None -# Each entry describes one shape of legacy (pre-sentinel) CoderMind snippet -# that may exist in a user's hook file from an older release. 
The first -# element is a substring of the snippet's first line (a marker comment); -# the second is the *total* number of consecutive lines that snippet -# occupies starting at the marker line. These are removed before the -# new sentinel block is written so users upgrading don't end up with the -# old snippet running alongside the new one. +# Each entry describes a CoderMind-owned hook snippet shape that can be +# recognized without sentinels. The first element is a substring of the +# snippet's marker comment; the second is the total number of consecutive +# lines occupied by that snippet. These are removed before the sentinel +# block is written so users do not end up with duplicate CoderMind logic. LegacyBlock = Tuple[str, int] @@ -2497,7 +2507,7 @@ def _strip_hook_block( Two cleanup passes: - 1. Strip the new-style sentinel block:: + 1. Strip the sentinel block:: # CMIND-BEGIN ... @@ -2506,11 +2516,11 @@ def _strip_hook_block( Range-based, so multi-line bodies of any shape are atomically removed in one shot. - 2. Strip each ``(marker_substring, line_count)`` legacy snippet - (the pre-sentinel format used through release v0.0.99-dev.72). - The marker line plus ``line_count - 1`` lines following it are - dropped. Multiple legacy shapes are removed in a single pass - so the order of entries in ``legacy_blocks`` doesn't matter. + 2. Strip each compatibility snippet described by + ``(marker_substring, line_count)``. The marker line plus + ``line_count - 1`` lines following it are dropped. Multiple + shapes are removed in a single pass so the order of entries in + ``legacy_blocks`` doesn't matter. Lines outside both passes are preserved verbatim so user-authored hook content (and shebangs) survive untouched. @@ -2534,7 +2544,7 @@ def _strip_hook_block( continue after_sentinels.append(line) - # Pass 2: strip legacy snippets by (marker, line_count). + # Pass 2: strip compatibility snippets by (marker, line_count). 
if not legacy_blocks: return "\n".join(after_sentinels) @@ -2597,11 +2607,9 @@ def _install_hook_snippet( The block is **atomically replaceable**: subsequent ``cmind init`` / ``cmind update`` runs find the existing sentinels and replace the - whole block, so behavior upgrades land cleanly without piling new - snippets on top of old ones. ``legacy_blocks`` is used **once** to - migrate pre-sentinel installs (released through v0.0.99-dev.72) onto - this scheme; once a user has been migrated their hook contains the - sentinels and the legacy patterns are no-ops. + whole block, so behavior upgrades land cleanly without duplicate + snippets. ``legacy_blocks`` recognizes CoderMind-owned hook bodies + that do not have sentinels yet. Creates the hook file with a ``#!/bin/sh`` shebang if absent; preserves any user-authored shebang otherwise. Always returns @@ -2633,14 +2641,10 @@ def _install_hook_snippet( def _uninstall_git_pre_commit_hook(project_path: Path) -> bool: - """Remove any previously-installed CoderMind ``pre-commit`` block. - - Pre-commit was retired in favour of ``post-commit`` only: the - pre-commit sync ran ``--staged-only`` and was immediately followed - by the full post-commit sync, so its output had a ~1 sec lifetime - and added latency to every ``git commit`` for no observable benefit. - Existing workspaces upgraded via ``cmind init`` / ``cmind update`` - have their pre-commit block stripped here; user-authored hook + """Remove any CoderMind-owned ``pre-commit`` block. + + The active git hook contract uses ``post-commit`` and ``post-merge``. + CoderMind-owned pre-commit blocks are stripped here; user-authored hook content (and other tools' blocks such as husky / pre-commit / lefthook) is preserved untouched. @@ -2692,7 +2696,7 @@ def _install_git_post_merge_hook(project_path: Path) -> bool: if hooks_dir is None: return False - # Level-1 hook: stub delegates to ``cmind hook post-merge``. + # Dispatcher stub delegates to ``cmind hook post-merge``. 
marker = "# CoderMind: post-merge dispatcher" body = ( f"{marker}\n" @@ -2711,30 +2715,29 @@ def _install_git_post_merge_hook(project_path: Path) -> bool: def _install_git_post_commit_hook(project_path: Path) -> bool: - """Install the Level-1 ``post-commit`` dispatcher stub. + """Install the ``post-commit`` dispatcher stub. - The on-disk hook is now a 3-line shell snippet that ``exec``s - ``cmind hook post-commit``. All orchestration lives in the + The on-disk hook is a short shell snippet that delegates to + ``cmind hook post-commit``. All orchestration lives in the :func:`hook` Python command: - * **Phase 1 (foreground)**: ``update_graphs.py sync`` advances - ``meta.git`` to the new HEAD. Output is teed into - ``~/.cmind/workspaces//logs/hooks.log``. + * **Foreground sync**: ``update_graphs.py sync`` advances + ``meta.git`` to the new HEAD. Output is teed into + ``~/.cmind/workspaces//logs/hooks.log``. - * **Phase 2 (background)**: ``update_graphs.py update-rpg`` is - detached via ``subprocess.Popen(start_new_session=True)``. A - mkdir-based directory lock at - ``~/.cmind/workspaces//logs/.update_rpg.lock`` serialises - overlapping commits; locks older than 60 minutes are treated as - orphaned and removed. The worker's stdout/stderr land in - ``~/.cmind/workspaces//logs/update_rpg.log``. + * **Background update**: ``update_graphs.py update-rpg`` is + detached via ``subprocess.Popen(start_new_session=True)``. A + mkdir-based directory lock at + ``~/.cmind/workspaces//logs/.update_rpg.lock`` serialises + overlapping commits; locks older than 60 minutes are treated as + orphaned and removed. The worker's stdout/stderr land in + ``~/.cmind/workspaces//logs/update_rpg.log``. - Both phases are best-effort: every failure path is swallowed inside + Both steps are best-effort: every failure path is swallowed inside :func:`hook` so a hook misbehaviour never blocks ``git commit``. 
- Legacy multi-line shell bodies from earlier releases (pre-Level-1) - are stripped on upgrade -- the ``legacy_blocks`` tuple below covers - every shape we've shipped. + CoderMind-owned multi-line shell bodies are stripped on upgrade by + the ``legacy_blocks`` compatibility patterns below. """ hooks_dir = _resolve_git_hooks_dir(project_path) if hooks_dir is None: @@ -2752,11 +2755,9 @@ def _install_git_post_commit_hook(project_path: Path) -> bool: "post-commit", body, legacy_blocks=( - # v1 (pre-Step-3): two-line sync-only snippet. + # Two-line sync-only snippet. ("# CoderMind: advance meta.git after commit", 2), - # v3 (release 0576393): five-line snippet with phase-1 sync - # + phase-2 setsid background under the same marker we used - # before Level-1. + # Five-line sync + setsid background-update snippet. ("# CoderMind: advance meta.git + background feature graph update", 5), ), ) @@ -4116,22 +4117,23 @@ def init( ) step_num += 1 - steps_lines.append(f"{step_num}. Start using slash commands with your AI agent:") + steps_lines.append(f"{step_num}. 
Start using high-level slash commands with your AI agent:") steps_lines.extend([ - f" {step_num}.1 [cyan]/cmind.feature_spec[/] - Create feature spec from docs", - f" {step_num}.2 [cyan]/cmind.feature_build[/] - Generate and Expand Feature Tree", - f" {step_num}.3 [cyan]/cmind.feature_refactor[/] - Refactor Feature Tree", - f" {step_num}.4 [cyan]/cmind.feature_edit[/] - Edit Feature Tree Nodes", - f" {step_num}.5 [cyan]/cmind.build_skeleton[/] - Repository Skeleton Structure", - f" {step_num}.6 [cyan]/cmind.build_data_flow[/] - Data Flow Design", - f" {step_num}.7 [cyan]/cmind.design_base_classes[/] - Base Classes Design", - f" {step_num}.8 [cyan]/cmind.design_interfaces[/] - Interface Design", - f" {step_num}.9 [cyan]/cmind.plan_tasks[/] - Task Planning", - f" {step_num}.10 [cyan]/cmind.code_gen[/] - Code Generation", - f" {step_num}.11 [cyan]/cmind.rpg_edit[/] - Surgical RPG/code edit", - f" {step_num}.12 [cyan]/cmind.encode[/] - Encode repo into RPG", - f" {step_num}.13 [cyan]/cmind.update_rpg[/] - Incremental RPG update", + " For new projects / requirements-to-code:", + f" {step_num}.1 [cyan]/cmind.feature_construct [/] - Build the feature tree from requirements", + f" {step_num}.2 [dim][Optional][/dim] [cyan]/cmind.feature_edit [/] - Edit Feature Tree Nodes", + f" {step_num}.3 [cyan]/cmind.plan[/] - Run RPG construction and planning", + f" {step_num}.4 [cyan]/cmind.code_gen[/] - Code Generation", + f" {step_num}.5 [dim][Optional][/dim] [cyan]/cmind.rpg_edit [/] - Surgical RPG/code edit", + "", + " For existing repositories / code-to-RPG:", + f" {step_num}.6 [cyan]/cmind.encode[/] - Encode an existing repo into RPG", + f" {step_num}.7 [cyan]/cmind.update_rpg[/] - Manual incremental RPG update fallback", + f" {step_num}.8 [dim][Optional][/dim] [cyan]/cmind.rpg_edit [/] - Surgical RPG/code edit", + "", + " For finer-grained commands and stage-by-stage reruns, see:", + " 
[link=https://github.com/microsoft/RPG-ZeroRepo/blob/main/CoderMind/docs/commands.md]https://github.com/microsoft/RPG-ZeroRepo/blob/main/CoderMind/docs/commands.md[/link]", ]) step_num += 1 @@ -4551,12 +4553,8 @@ def update( tracker.start("copilot-cli-mcp") _register_copilot_cli_global_mcp(tracker=tracker) - # Re-install hooks so behavior fixes propagate to existing - # workspaces. Without this, the .git/hooks/* files stay - # frozen at whatever version was active during the original - # `cmind init`, and the sentinel-block migration in - # _install_hook_snippet (the upgrade mechanism for hooks) - # never gets a chance to run. + # Reconcile hook files so existing workspaces receive the + # current post-commit/post-merge dispatcher contract. _install_hooks(project_path, selected_ai, tracker=tracker) tracker.complete("final", "update complete") @@ -5001,13 +4999,13 @@ def hook(name: str = typer.Argument(..., help="Hook name: post-commit | post-mer "sync", ) elif name == "post-commit": - # Phase 1: synchronous meta.git advance (fast, ~50ms). + # Fast foreground sync keeps meta.git aligned with HEAD. _hook_run_foreground( ws, log_path, env, ["update_graphs.py", "sync"], - "phase1-sync", + "foreground-sync", ) - # Phase 2: detached background LLM-driven RPG update. + # The LLM-driven RPG update runs detached from git commit. 
_hook_spawn_background(ws, home_dir, log_path, env) else: _hook_log_line(log_path, f"unknown hook name: {name!r}") diff --git a/CoderMind/templates/commands/code_gen.md b/CoderMind/templates/commands/code_gen.md index 40df473..4d54299 100644 --- a/CoderMind/templates/commands/code_gen.md +++ b/CoderMind/templates/commands/code_gen.md @@ -184,6 +184,9 @@ cmind script run_batch.py --retry --json # Run a specific batch by ID cmind script run_batch.py --batch-id --json +# Run a bounded smoke sample of the next N batches +cmind script run_batch.py --loop --max-batches --json + # Repo validation (pytest + smoke) cmind script run_batch.py --final-test --json diff --git a/CoderMind/tests/test_e2e.py b/CoderMind/tests/test_e2e.py index 18e0c0f..14a393e 100644 --- a/CoderMind/tests/test_e2e.py +++ b/CoderMind/tests/test_e2e.py @@ -598,12 +598,12 @@ def test_full_encode_search_update_cycle(self, encoded_rpg, cmind_dir): """Complete lifecycle test: encode, search, update, save, load.""" rpg = encoded_rpg - # Phase 1: Encode is already done (encoded_rpg fixture) + # Encode step is already done by the encoded_rpg fixture. assert rpg.repo_name == "sample_repo" areas_initial = rpg.get_functional_areas() node_count_initial = len(rpg.nodes) - # Phase 2: Search the encoded RPG + # Search the encoded RPG. from rpg_agent.ops.search_by_feature import ( exact_match_search_feature, ) @@ -611,7 +611,7 @@ def test_full_encode_search_update_cycle(self, encoded_rpg, cmind_dir): results = exact_match_search_feature(rpg, "validate_email") assert len(results) > 0 - # Phase 3: Update the RPG with new code + # Update the RPG with new code. new_code = textwrap.dedent("""\ import logging @@ -635,11 +635,11 @@ def log_error(self, user_id: int, error: str): audit_node = rpg.find_node_by_path("src/audit.py::AuditLogger") assert audit_node is not None - # Phase 4: Search again (should find new nodes) + # Search again; the generated node should be discoverable. 
results = exact_match_search_feature(rpg, "AuditLogger") assert len(results) > 0 - # Phase 5: Save the RPG + # Save the RPG. save_result = WorkflowIntegration.save_rpg( rpg=rpg, cmind_dir=cmind_dir, @@ -648,7 +648,7 @@ def log_error(self, user_id: int, error: str): ) assert os.path.isfile(save_result["rpg_path"]) - # Phase 6: Load and verify + # Load and verify. loaded = WorkflowIntegration.load_rpg(cmind_dir) assert loaded is not None assert loaded.repo_name == "sample_repo" @@ -657,7 +657,7 @@ def log_error(self, user_id: int, error: str): audit_loaded = loaded.find_node_by_path("src/audit.py::AuditLogger") assert audit_loaded is not None - # Phase 7: prepare_for_codegen on the loaded RPG + # Prepare code-generation context from the loaded RPG. context = WorkflowIntegration.prepare_for_codegen(rpg=loaded) assert context["repo_name"] == "sample_repo" assert "existing_interfaces" in context diff --git a/CoderMind/tests/test_encode_commands.py b/CoderMind/tests/test_encode_commands.py index f05b9fe..85ca8cb 100644 --- a/CoderMind/tests/test_encode_commands.py +++ b/CoderMind/tests/test_encode_commands.py @@ -347,7 +347,7 @@ def test_update_rpg_template_references_check_script(self): with open(update_md, "r", encoding="utf-8") as f: content = f.read() assert "check_encode.py" in content - assert "run_update_rpg.py" in content + assert "update_graphs.py update-rpg" in content # ============================================================================ diff --git a/CoderMind/tests/test_feature_construct_orchestrator.py b/CoderMind/tests/test_feature_construct_orchestrator.py index 6e6c965..e97f31d 100644 --- a/CoderMind/tests/test_feature_construct_orchestrator.py +++ b/CoderMind/tests/test_feature_construct_orchestrator.py @@ -1,4 +1,4 @@ -"""Unit tests for the Phase 1 feature construction orchestrator.""" +"""Unit tests for the feature construction orchestrator.""" from __future__ import annotations @@ -50,7 +50,12 @@ def _write_text(path: Path, text: str) -> None: 
def _valid_feature_spec() -> dict[str, object]: return { - "meta": {"generated_at": "2026-05-25", "project_types": ["CLI"]}, + "meta": { + "generated_at": "2026-05-25", + "project_types": ["CLI"], + "primary_language": "python", + "target_languages": ["python"], + }, "repository_name": "sample-cli", "repository_purpose": "Build a sample CLI.", "background_and_overview": [{"id": "BG-001", "description": "Users need a CLI."}], @@ -59,6 +64,23 @@ def _valid_feature_spec() -> dict[str, object]: } +def _valid_feature_build(language: str = "python") -> dict[str, object]: + return { + "feature_tree": {}, + "meta": {"primary_language": language, "target_languages": [language]}, + } + + +def _valid_feature_tree( + language: str = "python", + component_name: str = "core", +) -> dict[str, object]: + return { + "components": [{"name": component_name}], + "meta": {"primary_language": language, "target_languages": [language]}, + } + + def _states(types: list[str]) -> list["feature_construct.StageState"]: assert len(types) == len(feature_construct.STAGES) return [ @@ -88,8 +110,8 @@ def test_missing_artifacts_are_incomplete(self, artifact_paths: dict[str, Path]) def test_valid_artifacts_are_complete(self, artifact_paths: dict[str, Path]) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) - _write_json(artifact_paths["feature_build"], {"feature_tree": {}}) - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "core"}]}) + _write_json(artifact_paths["feature_build"], _valid_feature_build()) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree()) states = feature_construct.probe() assert [state.type for state in states] == ["update", "update", "update"] @@ -105,6 +127,57 @@ def test_feature_spec_requires_downstream_fields(self, artifact_paths: dict[str, assert state.done is False assert "functional_requirements" in state.message + def test_feature_spec_requires_language_fields(self, artifact_paths: dict[str, Path]) -> 
None: + spec = _valid_feature_spec() + meta = spec["meta"] + assert isinstance(meta, dict) + meta.pop("primary_language") + meta.pop("target_languages") + _write_json(artifact_paths["feature_spec"], spec) + + state = feature_construct.probe()[0] + + assert state.type == "warning" + assert state.done is False + assert "meta.primary_language" in state.message + assert "meta.target_languages" in state.message + + def test_feature_build_preserves_feature_spec_language(self, artifact_paths: dict[str, Path]) -> None: + spec = _valid_feature_spec() + spec["meta"] = {**spec["meta"], "primary_language": "go", "target_languages": ["go"]} + _write_json(artifact_paths["feature_spec"], spec) + _write_json( + artifact_paths["feature_build"], + { + "feature_tree": {}, + "meta": {"primary_language": "python", "target_languages": ["python"]}, + }, + ) + + state = feature_construct.probe()[1] + + assert state.type == "warning" + assert state.done is False + assert "expected 'go'" in state.message + + def test_feature_refactor_preserves_feature_spec_language(self, artifact_paths: dict[str, Path]) -> None: + spec = _valid_feature_spec() + spec["meta"] = {**spec["meta"], "primary_language": "go", "target_languages": ["go"]} + _write_json(artifact_paths["feature_spec"], spec) + _write_json( + artifact_paths["feature_refactor"], + { + "components": [{"name": "core"}], + "meta": {"primary_language": "go", "target_languages": []}, + }, + ) + + state = feature_construct.probe()[2] + + assert state.type == "warning" + assert state.done is False + assert "target_languages" in state.message + def test_feature_refactor_requires_non_empty_components(self, artifact_paths: dict[str, Path]) -> None: _write_json(artifact_paths["feature_refactor"], {"components": []}) @@ -117,7 +190,7 @@ def test_feature_refactor_requires_non_empty_components(self, artifact_paths: di class TestCheckOnlyJson: def test_json_payload_reports_progress(self, artifact_paths: dict[str, Path], capsys: 
pytest.CaptureFixture[str]) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) - _write_json(artifact_paths["feature_build"], {"feature_tree": {}}) + _write_json(artifact_paths["feature_build"], _valid_feature_build()) rc = feature_construct.main(["--check-only", "--json"]) captured = capsys.readouterr() @@ -134,6 +207,7 @@ def test_json_payload_reports_progress(self, artifact_paths: dict[str, Path], ca "feature_refactor", ] assert [stage["done"] for stage in payload["stages"]] == [True, True, False] + assert payload["stages"][0]["details"]["primary_language"] == "python" class TestExecutionReset: @@ -144,7 +218,7 @@ def test_force_removes_stale_output_sensitive_artifacts_before_stage_invocation( ) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) _write_json(artifact_paths["feature_build"], {"stale": "build"}) - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "stale"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="stale")) calls: list[str] = [] def fake_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> int: @@ -153,10 +227,10 @@ def fake_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> in _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) elif script_name == "feature_build.py": assert not artifact_paths["feature_build"].exists() - _write_json(artifact_paths["feature_build"], {"feature_tree": {"fresh": True}}) + _write_json(artifact_paths["feature_build"], _valid_feature_build()) elif script_name == "feature_refactor.py": assert not artifact_paths["feature_refactor"].exists() - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "fresh"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="fresh")) return 0 monkeypatch.setattr(feature_construct, "_run_stage", fake_run_stage) @@ -175,7 +249,7 @@ def 
test_cascade_removes_stale_downstream_artifacts_before_stage_invocation( spec.pop("repository_purpose") _write_json(artifact_paths["feature_spec"], spec) _write_json(artifact_paths["feature_build"], {"stale": "build"}) - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "stale"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="stale")) calls: list[str] = [] def fake_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> int: @@ -184,10 +258,10 @@ def fake_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> in _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) elif script_name == "feature_build.py": assert not artifact_paths["feature_build"].exists() - _write_json(artifact_paths["feature_build"], {"feature_tree": {"fresh": True}}) + _write_json(artifact_paths["feature_build"], _valid_feature_build()) elif script_name == "feature_refactor.py": assert not artifact_paths["feature_refactor"].exists() - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "fresh"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="fresh")) return 0 monkeypatch.setattr(feature_construct, "_run_stage", fake_run_stage) @@ -204,17 +278,17 @@ def test_invalid_output_sensitive_artifact_is_removed_before_stage_invocation( ) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) _write_text(artifact_paths["feature_build"], "{") - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "stale"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="stale")) calls: list[str] = [] def fake_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> int: calls.append(script_name) if script_name == "feature_build.py": assert not artifact_paths["feature_build"].exists() - _write_json(artifact_paths["feature_build"], {"feature_tree": {"fresh": True}}) + 
_write_json(artifact_paths["feature_build"], _valid_feature_build()) elif script_name == "feature_refactor.py": assert not artifact_paths["feature_refactor"].exists() - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "fresh"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="fresh")) return 0 monkeypatch.setattr(feature_construct, "_run_stage", fake_run_stage) @@ -230,8 +304,8 @@ def test_all_up_to_date_skip_path_does_not_remove_artifacts( monkeypatch: pytest.MonkeyPatch, ) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) - _write_json(artifact_paths["feature_build"], {"feature_tree": {}}) - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "core"}]}) + _write_json(artifact_paths["feature_build"], _valid_feature_build()) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree()) def fail_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> int: pytest.fail(f"unexpected stage run: {script_name}") @@ -253,7 +327,7 @@ def test_check_only_does_not_remove_artifacts_or_run_stages( ) -> None: _write_text(artifact_paths["feature_spec"], "{") _write_json(artifact_paths["feature_build"], {"stale": "build"}) - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "stale"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="stale")) def fail_run_stage(invoker: list[str], script_name: str, extra: list[str]) -> int: pytest.fail(f"unexpected stage run: {script_name}") @@ -275,7 +349,7 @@ def test_dry_run_does_not_remove_artifacts_or_run_stages( ) -> None: _write_json(artifact_paths["feature_spec"], _valid_feature_spec()) _write_text(artifact_paths["feature_build"], "{") - _write_json(artifact_paths["feature_refactor"], {"components": [{"name": "stale"}]}) + _write_json(artifact_paths["feature_refactor"], _valid_feature_tree(component_name="stale")) def fail_run_stage(invoker: 
list[str], script_name: str, extra: list[str]) -> int: pytest.fail(f"unexpected stage run: {script_name}") diff --git a/CoderMind/tests/test_hooks_install.py b/CoderMind/tests/test_hooks_install.py index 07cc95a..a222fd7 100644 --- a/CoderMind/tests/test_hooks_install.py +++ b/CoderMind/tests/test_hooks_install.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests for CoderMind hook installation (Claude SessionStart, Copilot folderOpen task, and git pre-commit) and the ``update_graphs.py status`` subcommand the hooks invoke. +"""Tests for CoderMind hook installation and status loading. Verifies: - ``_install_claude_hooks`` writes a SessionStart hook that calls @@ -7,8 +7,9 @@ - ``_install_copilot_hooks`` writes a VS Code task with ``runOptions.runOn = "folderOpen"``, is idempotent, and preserves pre-existing user tasks. - - ``_install_hooks`` dispatches the right AI-specific installer and - also wires up the git pre-commit hook when a ``.git`` dir exists. + - ``_install_hooks`` dispatches the right AI-specific installer, + installs git post-commit/post-merge dispatchers, and removes + CoderMind-owned pre-commit blocks when a ``.git`` dir exists. - ``update_graphs.py status`` returns RPG/dep-graph stats + an agent-facing MCP-tools reminder, on both populated and empty workspaces. @@ -77,8 +78,8 @@ def test_install_claude_hooks_is_idempotent_across_python_upgrades(project, monk (not a duplicate per invocation). """ cmind_cli._install_claude_hooks(project) - # Simulate any environment change that previously affected hook content; - # the new hook body is interpreter-independent so this should be a no-op. + # Simulate an environment change; the hook body is + # interpreter-independent so this should be a no-op. 
monkeypatch.setattr(cmind_cli.sys, "executable", "/opt/new-python/bin/python") cmind_cli._install_claude_hooks(project) data = json.loads((project / ".claude" / "settings.json").read_text()) @@ -97,10 +98,8 @@ def test_install_claude_hooks_is_idempotent_across_python_upgrades(project, monk def test_install_claude_hooks_shell_escapes_special_chars(project, monkeypatch): """Interpreter / workspace paths must not appear in the hook command. - Previously the hook embedded ``sys.executable`` and the workspace - script path, requiring ``shlex.quote`` to survive spaces. The new - hook body invokes the global ``cmind`` CLI directly, so paths with - special characters can't end up inside the command string. + The hook body invokes the global ``cmind`` CLI directly, so paths + with special characters cannot end up inside the command string. """ monkeypatch.setattr( cmind_cli.sys, "executable", "/path with space/python" @@ -193,7 +192,6 @@ def test_install_copilot_hooks_preserves_user_tasks(project): # --------------------------------------------------------------------------- def test_install_hooks_dispatches_to_copilot(project, monkeypatch): - # Pretend the project is a git repo so the pre-commit installer fires. (project / ".git" / "hooks").mkdir(parents=True) cmind_cli._install_hooks(project, "copilot", tracker=None) @@ -201,13 +199,14 @@ def test_install_hooks_dispatches_to_copilot(project, monkeypatch): # Copilot tasks.json present, Claude settings.json absent. assert (project / ".vscode" / "tasks.json").is_file() assert not (project / ".claude" / "settings.json").exists() - # Pre-commit hook installed. - pre = (project / ".git" / "hooks" / "pre-commit").read_text() - assert "CoderMind: incremental RPG sync on commit" in pre - assert "update_graphs.py" in pre and "sync" in pre - # Hook must pass ``--staged-only`` so it doesn't pull working-tree - # changes that the user hasn't ``git add``'d. 
- assert "--staged-only" in pre + hooks_dir = project / ".git" / "hooks" + post_commit = (hooks_dir / "post-commit").read_text() + post_merge = (hooks_dir / "post-merge").read_text() + assert "CoderMind: post-commit dispatcher" in post_commit + assert "cmind hook post-commit" in post_commit + assert "CoderMind: post-merge dispatcher" in post_merge + assert "cmind hook post-merge" in post_merge + assert not (hooks_dir / "pre-commit").exists() def test_install_hooks_dispatches_to_claude(project): @@ -217,17 +216,18 @@ def test_install_hooks_dispatches_to_claude(project): assert (project / ".claude" / "settings.json").is_file() assert not (project / ".vscode" / "tasks.json").exists() - assert (project / ".git" / "hooks" / "pre-commit").is_file() + hooks_dir = project / ".git" / "hooks" + assert (hooks_dir / "post-commit").is_file() + assert (hooks_dir / "post-merge").is_file() + assert not (hooks_dir / "pre-commit").exists() def test_update_command_invokes_install_hooks(): """Regression tripwire: ``cmind update`` must call ``_install_hooks``. - Previously ``update`` re-downloaded templates / refreshed gitignore - / regenerated MCP config but silently *skipped* hook installation. - Result: users running ``cmind update`` after upgrading the CLI - never received hook fixes \u2014 ``.git/hooks/*`` stayed frozen at - whatever version was active during the original ``cmind init``. + Hook installation belongs in the update flow alongside template, + gitignore, and MCP config refreshes, so existing workspaces receive + hook dispatcher fixes when users run ``cmind update``. 
This is a static-source assertion rather than an end-to-end test because ``update`` does network I/O (template download) that is @@ -249,16 +249,11 @@ def test_update_command_invokes_install_hooks(): # --------------------------------------------------------------------------- -# Sentinel-block upgrade migration (regression for P0-4) +# Sentinel-block upgrade migration # --------------------------------------------------------------------------- # -# Prior to the sentinel-block design, ``_install_hook_snippet`` returned -# early as soon as any known marker (current or legacy) appeared in the -# hook file. Combined with marker renames between releases, this meant -# every upgrade was a silent no-op: users kept whatever they were first -# installed with, and never picked up new behavior. The tests below -# pin the upgrade semantics: a fresh install must REPLACE any prior -# CoderMind-owned content rather than refusing to write or stacking copies. +# The installer must replace CoderMind-owned content by sentinel range or +# compatibility marker, while preserving user-authored shell lines. def _hooks_dir(project): @@ -267,8 +262,8 @@ def _hooks_dir(project): return hd -def test_pre_commit_v1_legacy_is_replaced_on_upgrade(project): - """v1 pre-commit shipped a 2-line full-sync snippet; the current installer must remove it and write the new sentinel-wrapped block.""" +def test_pre_commit_v1_legacy_is_removed_on_upgrade(project): + """A CoderMind-owned pre-commit snippet is removed during hook setup.""" hd = _hooks_dir(project) (hd / "pre-commit").write_text( "#!/bin/sh\n" @@ -276,21 +271,12 @@ def test_pre_commit_v1_legacy_is_replaced_on_upgrade(project): "/old/python /old/update_graphs.py sync 2>/dev/null || true\n" ) - assert cmind_cli._install_git_pre_commit_hook(project) is True - text = (hd / "pre-commit").read_text() - - # Old marker + old command line are gone. 
- assert "# CoderMind: full RPG sync on commit" not in text - assert "/old/python" not in text - # New sentinel-wrapped block is present exactly once. - assert text.count("# CMIND-BEGIN pre-commit") == 1 - assert text.count("# CMIND-END pre-commit") == 1 - assert "# CoderMind: incremental RPG sync on commit" in text - assert "--staged-only" in text + assert cmind_cli._uninstall_git_pre_commit_hook(project) is True + assert not (hd / "pre-commit").exists() def test_post_commit_v1_legacy_is_replaced_on_upgrade(project): - """v1 post-commit shipped a 2-line sync-only snippet under the ``advance meta.git after commit`` marker. Must be replaced by the current 2-phase (sync + background update-rpg) sentinel block.""" + """A sync-only post-commit snippet upgrades to the dispatcher block.""" hd = _hooks_dir(project) (hd / "post-commit").write_text( "#!/bin/sh\n" @@ -304,16 +290,13 @@ def test_post_commit_v1_legacy_is_replaced_on_upgrade(project): assert "# CoderMind: advance meta.git after commit" not in text assert "/old/python" not in text assert text.count("# CMIND-BEGIN post-commit") == 1 - assert "update-rpg" in text # phase 2 is now present + assert text.count("# CMIND-END post-commit") == 1 + assert "CoderMind: post-commit dispatcher" in text + assert "cmind hook post-commit" in text def test_post_commit_v3_legacy_is_replaced_on_upgrade(project): - """v3 (release 0576393) shipped a 5-line setsid+lock snippet WITHOUT sentinels. The new installer must recognise its marker and line count and replace the whole block in place. - - This is the case that motivated the sentinel-block refactor: under - the old marker-substring dedupe, the v3 marker matching itself made - every subsequent install a no-op. 
- """ + """A multi-line post-commit snippet upgrades to the dispatcher block.""" hd = _hooks_dir(project) old_body = ( "#!/bin/sh\n" @@ -330,55 +313,48 @@ def test_post_commit_v3_legacy_is_replaced_on_upgrade(project): assert cmind_cli._install_git_post_commit_hook(project) is True text = (hd / "post-commit").read_text() - # Old paths are gone — proves the v3 block was actually stripped. assert "/old/python" not in text assert "/old/.lock" not in text - # New sentinel block is present exactly once (no duplicate piling). assert text.count("# CMIND-BEGIN post-commit") == 1 assert text.count("# CMIND-END post-commit") == 1 - # Current marker survives inside the new block. - assert text.count( - "# CoderMind: advance meta.git + background feature graph update" - ) == 1 + assert text.count("# CoderMind: post-commit dispatcher") == 1 + assert "cmind hook post-commit" in text def test_install_is_idempotent_under_sentinels(project): - """Repeated installs must not stack sentinel blocks or duplicate content — the second install replaces the first verbatim.""" + """Repeated dispatcher installs must not stack sentinel blocks.""" hd = _hooks_dir(project) - cmind_cli._install_git_pre_commit_hook(project) - first = (hd / "pre-commit").read_text() - cmind_cli._install_git_pre_commit_hook(project) - cmind_cli._install_git_pre_commit_hook(project) - third = (hd / "pre-commit").read_text() + cmind_cli._install_git_post_commit_hook(project) + first = (hd / "post-commit").read_text() + cmind_cli._install_git_post_commit_hook(project) + cmind_cli._install_git_post_commit_hook(project) + third = (hd / "post-commit").read_text() assert first == third - assert third.count("# CMIND-BEGIN pre-commit") == 1 - assert third.count("# CMIND-END pre-commit") == 1 + assert third.count("# CMIND-BEGIN post-commit") == 1 + assert third.count("# CMIND-END post-commit") == 1 def test_sentinel_block_is_atomically_replaceable(project): - """If a future release changes the body inside the block, the 
sentinel-pair range is replaced wholesale. Simulated here by hand-writing an "old" block (different body content) and asserting that the install replaces it.""" + """The sentinel-pair range is replaced wholesale on install.""" hd = _hooks_dir(project) - (hd / "pre-commit").write_text( + (hd / "post-commit").write_text( "#!/bin/sh\n" "\n" - "# CMIND-BEGIN pre-commit\n" - "# CoderMind: incremental RPG sync on commit\n" + "# CMIND-BEGIN post-commit\n" + "# CoderMind: post-commit dispatcher\n" "/some/older/path/python /some/older/script.py sync --legacy-flag\n" - "# CMIND-END pre-commit\n" + "# CMIND-END post-commit\n" ) - assert cmind_cli._install_git_pre_commit_hook(project) is True - text = (hd / "pre-commit").read_text() + assert cmind_cli._install_git_post_commit_hook(project) is True + text = (hd / "post-commit").read_text() - # Old body content gone. assert "/some/older/path/python" not in text assert "--legacy-flag" not in text - # Exactly one sentinel pair. - assert text.count("# CMIND-BEGIN pre-commit") == 1 - assert text.count("# CMIND-END pre-commit") == 1 - # New body present. - assert "--staged-only" in text + assert text.count("# CMIND-BEGIN post-commit") == 1 + assert text.count("# CMIND-END post-commit") == 1 + assert "cmind hook post-commit" in text def test_user_authored_content_outside_block_is_preserved(project): @@ -394,13 +370,14 @@ def test_user_authored_content_outside_block_is_preserved(project): "echo 'user-postlude: still going' >&2\n" ) - assert cmind_cli._install_git_pre_commit_hook(project) is True + assert cmind_cli._uninstall_git_pre_commit_hook(project) is True text = (hd / "pre-commit").read_text() assert "user-prelude" in text assert "user-postlude" in text - # And the CoderMind content was actually upgraded (old python path gone). 
assert "/old/python" not in text + assert "# CMIND-BEGIN pre-commit" not in text + assert "# CMIND-END pre-commit" not in text # --------------------------------------------------------------------------- @@ -510,13 +487,13 @@ def test_setup_gitignore_greenfield_writes_full_template(tmp_path): # Python conventions (matches github/gitignore/Python.gitignore verbatim) assert "__pycache__/" in content assert ".venv" in content # upstream uses ``.venv`` (no trailing slash) - # Sections that ONLY exist in the full GitHub template (regression - # guard for the slimmed-down version we used previously). + # Sections that only exist in the full GitHub template. assert "PyInstaller" in content assert "Jupyter Notebook" in content assert ".ipynb_checkpoints" in content # CoderMind common (runtime + machine-specific) - assert ".cmind/" in content + assert ".cmind/*" in content + assert "!.cmind/config.toml" in content assert ".vscode/mcp.json" in content assert ".vscode/tasks.json" in content assert ".mcp.json" in content @@ -543,7 +520,7 @@ def test_setup_gitignore_existing_git_no_ignore_writes_cmind_only(tmp_path): cmind_cli._setup_gitignore(tmp_path, "copilot") content = (tmp_path / ".gitignore").read_text() # CoderMind rules present - assert ".cmind/" in content + assert ".cmind/*" in content assert ".github/agents/" in content # Python conventions NOT imposed on existing repo assert "__pycache__/" not in content @@ -562,7 +539,7 @@ def test_setup_gitignore_existing_gitignore_preserves_user_entries(tmp_path): assert "node_modules/" in content assert "*.tmp" in content # CoderMind rules appended - assert ".cmind/" in content + assert ".cmind/*" in content assert ".github/agents/" in content @@ -575,24 +552,20 @@ def test_setup_gitignore_is_idempotent(tmp_path): assert first == second # second call is a no-op # No duplicate CoderMind header assert second.count(cmind_cli._GITIGNORE_CMIND_HEADER) == 1 - # No duplicate .cmind/ directory entry. 
Count actual lines (after - # stripping) because the appended block also contains - # `!.cmind/config.toml` which holds .cmind/ as a substring. + # No duplicate runtime-directory glob entry. lines = [l.strip() for l in second.splitlines()] - assert lines.count(".cmind/") == 1 + assert lines.count(".cmind/*") == 1 def test_setup_gitignore_partial_existing_rules_only_appends_missing(tmp_path): """If user already has SOME CoderMind rules, only missing ones get appended.""" - # User has manually added .cmind/ but nothing else - (tmp_path / ".gitignore").write_text(".cmind/\n") + # User has manually added the runtime-directory glob but nothing else. + (tmp_path / ".gitignore").write_text(".cmind/*\n") cmind_cli._setup_gitignore(tmp_path, "copilot") content = (tmp_path / ".gitignore").read_text() - # .cmind/ directory entry must NOT be duplicated. Compare exact - # lines (after stripping) because the appended block also contains - # `!.cmind/config.toml` which holds .cmind/ as a substring. + # The runtime-directory glob must not be duplicated. 
lines = [l.strip() for l in content.splitlines()] - assert lines.count(".cmind/") == 1 + assert lines.count(".cmind/*") == 1 # The new managed config.toml un-ignore line is present assert "!.cmind/config.toml" in lines # Missing rules are now present diff --git a/CoderMind/tests/test_initial_encode_prompt.py b/CoderMind/tests/test_initial_encode_prompt.py index 9bee89f..4d3efe2 100644 --- a/CoderMind/tests/test_initial_encode_prompt.py +++ b/CoderMind/tests/test_initial_encode_prompt.py @@ -321,7 +321,7 @@ def test_run_initial_encode_success_writes_log(tmp_path): stdout_text='{"status": "success"}\n', ) assert cmind_cli._run_initial_encode(tmp_path) is True - log = tmp_path / ".cmind" / "logs" / "encode.log" + log = cmind_cli._storage.workspace_logs_dir(tmp_path) / "encode.log" assert log.is_file() contents = log.read_text() assert "Generating repo info" in contents @@ -337,6 +337,6 @@ def test_run_initial_encode_failure_returns_false(tmp_path): stdout_text='{"status": "failed", "error": "boom"}\n', ) assert cmind_cli._run_initial_encode(tmp_path) is False - log = tmp_path / ".cmind" / "logs" / "encode.log" + log = cmind_cli._storage.workspace_logs_dir(tmp_path) / "encode.log" assert log.is_file() assert "boom" in log.read_text() diff --git a/CoderMind/tests/test_integration.py b/CoderMind/tests/test_integration.py index cbf5d9d..b22da59 100644 --- a/CoderMind/tests/test_integration.py +++ b/CoderMind/tests/test_integration.py @@ -338,7 +338,7 @@ def test_parsed_tree_updates_existing_rpg(self, rpg_with_structure, sample_parse # feature value as the node name, not the key after "function ") new_nodes = [ n for n in rpg.nodes.values() - if n.meta and n.meta.path == "src/user_manager.py:new_utility" + if n.meta and n.meta.path == "src/user_manager.py::new_utility" ] assert len(new_nodes) == 1 # The name is taken from the feature list @@ -585,16 +585,9 @@ def test_delete_files_and_clean_empty_parents(self): def test_evolution_process_diff_no_changes(self, 
rpg_with_structure): """process_diff with no changes returns the RPG unchanged.""" with patch( - "rpg_encoder.rpg_evolution.RPGParser", - ) as MockParser, \ - patch( "rpg_encoder.rpg_evolution.generate_detailed_diff", return_value={"added": {}, "deleted": {}, "modified": {}}, ): - mock_instance = MagicMock() - mock_instance.exclude_irrelevant_files.return_value = [] - MockParser.return_value = mock_instance - result = RPGEvolution.process_diff( repo_name="test_project", repo_info="test", diff --git a/CoderMind/tests/test_plan_orchestrator.py b/CoderMind/tests/test_plan_orchestrator.py index ced6133..60220ca 100644 --- a/CoderMind/tests/test_plan_orchestrator.py +++ b/CoderMind/tests/test_plan_orchestrator.py @@ -87,16 +87,14 @@ def test_cascade_forces_downstream_even_if_update(self) -> None: # downstream reasons should mention cascade assert "upstream" in states[1].reason - def test_warning_is_treated_as_done(self) -> None: - # ``warning`` means the artefact is present and usable, only a - # soft inconsistency was flagged. decide() must NOT rebuild it - # (otherwise re-running plan.py would loop forever on a stage - # whose check perpetually emits warning). + def test_warning_is_treated_as_incomplete(self) -> None: + # A warning means the artefact violates a cross-stage contract. + # Rebuild from that stage so bench cannot report a false PASS for + # a partial plan. states = _states(["update", "warning", "update", "update", "update"]) plan.decide(states, force=False) - assert [s.will_run for s in states] == [False, False, False, False, False] - # The warning state should still be visible in the reason string. 
- assert "warning" in states[1].reason + assert [s.will_run for s in states] == [False, True, True, True, True] + assert states[1].reason == "type=warning" def test_force_runs_everything(self) -> None: states = _states(["update"] * 5) diff --git a/CoderMind/tests/test_rpg_evolution.py b/CoderMind/tests/test_rpg_evolution.py index 2cff994..e504485 100644 --- a/CoderMind/tests/test_rpg_evolution.py +++ b/CoderMind/tests/test_rpg_evolution.py @@ -590,21 +590,21 @@ class TestRPGEvolutionUpdateDepGraph: def test_update_dep_graph_index_no_crash(self, simple_rpg): logger = logging.getLogger("test_dep") - # Mock dep_graph to avoid needing a real repo - with patch.object(RPG, "parse_dep_graph") as mock_parse: - mock_dg = MagicMock() - mock_dg.G.nodes.return_value = ["n1", "n2"] - mock_parse.return_value = mock_dg - simple_rpg.dep_graph = mock_dg - simple_rpg._dep_to_rpg_map = {"n1": ["a"]} - + with patch("rpg.service.RPGService.refresh_dep_graph") as mock_refresh: RPGEvolution._update_dep_graph_index(simple_rpg, "/tmp/fake", logger) - mock_parse.assert_called_once() + mock_refresh.assert_called_once_with( + code_dir="/tmp/fake", + workspace_root="/tmp/fake", + save_path=None, + ) def test_update_dep_graph_handles_error(self, simple_rpg): logger = logging.getLogger("test_dep_err") - with patch.object(RPG, "parse_dep_graph", side_effect=RuntimeError("fail")): + with patch( + "rpg.service.RPGService.refresh_dep_graph", + side_effect=RuntimeError("fail"), + ): # Should not raise RPGEvolution._update_dep_graph_index(simple_rpg, "/tmp/fake", logger) @@ -615,18 +615,11 @@ class TestRPGEvolutionProcessDiff: def test_no_changes_detected(self, simple_rpg): """When diff detects no changes, RPG should be returned unchanged.""" with patch( - "rpg_encoder.rpg_evolution.RPGParser", - ) as MockParser, \ - patch( "rpg_encoder.rpg_evolution.generate_detailed_diff", return_value={"added": {}, "deleted": {}, "modified": {}}, ), \ patch.object(RPG, "parse_dep_graph"): - mock_instance = 
MagicMock() - mock_instance.exclude_irrelevant_files.return_value = [] - MockParser.return_value = mock_instance - result = RPGEvolution.process_diff( repo_name="test", repo_info="Test repo", @@ -649,17 +642,10 @@ def test_delete_only(self, simple_rpg): } with patch( - "rpg_encoder.rpg_evolution.RPGParser", - ) as MockParser, \ - patch( "rpg_encoder.rpg_evolution.generate_detailed_diff", return_value=diff_result, ): - mock_instance = MagicMock() - mock_instance.exclude_irrelevant_files.return_value = [] - MockParser.return_value = mock_instance - result = RPGEvolution.process_diff( repo_name="test", repo_info="Test repo", @@ -686,17 +672,10 @@ def test_save_path_creates_file(self, simple_rpg): } with patch( - "rpg_encoder.rpg_evolution.RPGParser", - ) as MockParser, \ - patch( "rpg_encoder.rpg_evolution.generate_detailed_diff", return_value=diff_result, ): - mock_instance = MagicMock() - mock_instance.exclude_irrelevant_files.return_value = [] - MockParser.return_value = mock_instance - RPGEvolution.process_diff( repo_name="test", repo_info="Test repo", diff --git a/CoderMind/tests/test_rpg_io.py b/CoderMind/tests/test_rpg_io.py index 2322867..6cf5580 100644 --- a/CoderMind/tests/test_rpg_io.py +++ b/CoderMind/tests/test_rpg_io.py @@ -108,6 +108,45 @@ def test_preserves_unicode(self, tmp_path: Path) -> None: loaded = json.loads(target.read_text(encoding="utf-8")) assert loaded["name"] == "测试 \u2014 ✓" + def test_forwards_dump_kwargs(self, tmp_path: Path) -> None: + """``**dump_kwargs`` is forwarded to ``json.dump`` so callers + can pass custom serialiser hooks such as ``default=``.""" + target = tmp_path / "rpg.json" + + class _NotSerialisable: + def to_dict(self): + return {"recovered": True} + + # Without ``default=`` this would raise TypeError; passing the + # legacy lambda the encoder used proves the kwarg reaches json.dump. 
+ rpg_io.atomic_write_rpg( + target, + {"obj": _NotSerialisable()}, + default=lambda o: o.to_dict() if hasattr(o, "to_dict") else str(o), + ) + assert json.loads(target.read_text()) == {"obj": {"recovered": True}} + + def test_no_partial_file_on_serialise_failure(self, tmp_path: Path) -> None: + """A TypeError mid-``json.dump`` (no ``default=`` for an + unserialisable object) must leave the original file intact and + clean up the ``.tmp`` — the bug we kept hitting when the bench + killed cobra encode mid-write.""" + target = tmp_path / "rpg.json" + target.write_text('{"existing": "intact"}') + + class _Bad: + pass + + with pytest.raises(TypeError): + rpg_io.atomic_write_rpg(target, {"obj": _Bad()}) + + # Original survives because os.replace never ran. + assert json.loads(target.read_text()) == {"existing": "intact"} + # The .tmp file must be cleaned up so a re-run doesn't see stale + # crud from the failed attempt. + tmp = target.with_suffix(".json.tmp") + assert not tmp.exists() + # --------------------------------------------------------------------------- # safe_load_rpg — success path + propagation of FileNotFoundError diff --git a/CoderMind/tests/test_step3_polish.py b/CoderMind/tests/test_step3_polish.py index fd2af4a..215f264 100644 --- a/CoderMind/tests/test_step3_polish.py +++ b/CoderMind/tests/test_step3_polish.py @@ -13,9 +13,8 @@ ``head_timestamp`` even in **noop** mode (covers ``git checkout other_branch_at_same_sha`` and ``git branch -m`` cases). -D. ``_install_git_post_merge_hook`` installs an RPG sync hook in - ``post-merge`` so ``git pull`` / ``git merge`` keeps the graph - aligned with teammate-incoming code. +D. Git hook setup installs post-commit/post-merge dispatcher hooks and + removes CoderMind-owned pre-commit blocks. 
""" from __future__ import annotations @@ -135,27 +134,28 @@ def test_resolve_git_hooks_dir_empty_core_hooks_path_falls_back(tmp_path): assert resolved == repo / ".git" / "hooks" -def test_install_pre_commit_hook_via_core_hooks_path(tmp_path): - """End-to-end: when ``core.hooksPath`` is set, the installer must write into THAT directory, not ``.git/hooks``. This is the case where teams use husky / pre-commit / lefthook.""" +def test_uninstall_pre_commit_hook_via_core_hooks_path(tmp_path): + """``core.hooksPath`` directs pre-commit cleanup to the active hooks dir.""" repo = tmp_path / "repo" repo.mkdir() _sh(repo, "init", "-q", "-b", "main") custom_hooks = repo / ".husky" custom_hooks.mkdir() _sh(repo, "config", "core.hooksPath", str(custom_hooks)) + (custom_hooks / "pre-commit").write_text( + "#!/bin/sh\n" + "# CoderMind: full RPG sync on commit\n" + "cmind script update_graphs.py sync --staged-only\n" + ) - assert cmind_cli._install_git_pre_commit_hook(repo) is True + assert cmind_cli._uninstall_git_pre_commit_hook(repo) is True - # Hook landed in the custom dir, NOT in .git/hooks. 
- assert (custom_hooks / "pre-commit").is_file() + assert not (custom_hooks / "pre-commit").exists() assert not (repo / ".git" / "hooks" / "pre-commit").exists() - text = (custom_hooks / "pre-commit").read_text() - assert "CMIND-BEGIN pre-commit" in text - assert "--staged-only" in text -def test_install_pre_commit_hook_in_worktree(tmp_path): - """End-to-end: ``_install_git_pre_commit_hook`` must succeed for a worktree-style ``.git`` file (regression for the original bug where the installer did ``if not .git.is_dir(): return False``).""" +def test_uninstall_pre_commit_hook_in_worktree(tmp_path): + """Pre-commit cleanup works through a worktree-style ``.git`` file.""" main = tmp_path / "main" main.mkdir() _sh(main, "init", "-q", "-b", "main") @@ -166,12 +166,15 @@ def test_install_pre_commit_hook_in_worktree(tmp_path): _sh(main, "commit", "-q", "-m", "init") wt = tmp_path / "wt" _sh(main, "worktree", "add", "--detach", str(wt)) - - assert cmind_cli._install_git_pre_commit_hook(wt) is True - # Hook landed in the shared hooks dir (main repo) not the worktree pre_commit = main / ".git" / "hooks" / "pre-commit" - assert pre_commit.is_file() - assert "CoderMind: incremental RPG sync on commit" in pre_commit.read_text() + pre_commit.write_text( + "#!/bin/sh\n" + "# CoderMind: incremental RPG sync on commit\n" + "cmind script update_graphs.py sync --staged-only\n" + ) + + assert cmind_cli._uninstall_git_pre_commit_hook(wt) is True + assert not pre_commit.exists() # =========================================================================== @@ -348,12 +351,9 @@ def test_install_post_merge_hook_writes_script(tmp_path): post_merge = repo / ".git" / "hooks" / "post-merge" assert post_merge.is_file() content = post_merge.read_text() - assert "CoderMind: incremental RPG sync after merge / pull" in content - assert "update_graphs.py" in content and " sync " in content - # post-merge fires AFTER files are in the working tree, no staging - # area exists at that point — so the hook 
must NOT use --staged-only. + assert "CoderMind: post-merge dispatcher" in content + assert "cmind hook post-merge" in content assert "--staged-only" not in content - # Hook must be executable import stat assert post_merge.stat().st_mode & stat.S_IXUSR @@ -366,8 +366,7 @@ def test_install_post_merge_hook_is_idempotent(tmp_path): cmind_cli._install_git_post_merge_hook(repo) cmind_cli._install_git_post_merge_hook(repo) post_merge = (repo / ".git" / "hooks" / "post-merge").read_text() - # Marker appears exactly once - assert post_merge.count("CoderMind: incremental RPG sync after merge / pull") == 1 + assert post_merge.count("CoderMind: post-merge dispatcher") == 1 def test_install_post_merge_hook_preserves_existing_user_hook(tmp_path): @@ -383,11 +382,12 @@ def test_install_post_merge_hook_preserves_existing_user_hook(tmp_path): cmind_cli._install_git_post_merge_hook(repo) content = user_hook.read_text() assert "echo 'user custom hook'" in content - assert "CoderMind: incremental RPG sync after merge / pull" in content + assert "CoderMind: post-merge dispatcher" in content + assert "cmind hook post-merge" in content -def test_install_hooks_installs_both_pre_commit_and_post_merge(tmp_path): - """End-to-end: ``_install_hooks`` should produce all three hooks.""" +def test_install_hooks_installs_post_hooks_and_removes_pre_commit(tmp_path): + """End-to-end: ``_install_hooks`` writes post hooks and no pre-commit.""" project = tmp_path / "proj" project.mkdir() (project / ".cmind" / "scripts").mkdir(parents=True) @@ -400,19 +400,15 @@ def test_install_hooks_installs_both_pre_commit_and_post_merge(tmp_path): pre_commit = project / ".git" / "hooks" / "pre-commit" post_commit = project / ".git" / "hooks" / "post-commit" post_merge = project / ".git" / "hooks" / "post-merge" - assert pre_commit.is_file() + assert not pre_commit.exists() assert post_commit.is_file() assert post_merge.is_file() - # pre-commit uses --staged-only (only the index counts before commit - # is 
recorded). post-commit and post-merge do NOT — HEAD has moved - # by the time they fire, and there's no index to filter on anyway. - assert "--staged-only" in pre_commit.read_text() assert "--staged-only" not in post_commit.read_text() assert "--staged-only" not in post_merge.read_text() def test_install_post_commit_hook_writes_script(tmp_path): - """``post-commit`` exists to advance meta.git AFTER the new commit has been recorded (pre-commit fires too early — HEAD is still the previous commit, so meta.git would land 1 commit behind).""" + """``post-commit`` delegates to the Python hook dispatcher.""" repo = tmp_path / "repo" repo.mkdir() _sh(repo, "init", "-q") @@ -421,24 +417,16 @@ def test_install_post_commit_hook_writes_script(tmp_path): post_commit = repo / ".git" / "hooks" / "post-commit" assert post_commit.is_file() content = post_commit.read_text() - assert "CoderMind: advance meta.git + background feature graph update" in content - assert "update_graphs.py" in content and " sync " in content - assert "update-rpg" in content - # Must unset GIT_INDEX_FILE to avoid hook env var leaking into - # background worktree operations. - assert "GIT_INDEX_FILE" in content - # Detach via nohup (POSIX, portable to macOS). setsid was used - # previously but is util-linux-only and silently absent on macOS. - assert "nohup" in content + assert "CoderMind: post-commit dispatcher" in content + assert "cmind hook post-commit" in content + assert "update_graphs.py" not in content + assert "update-rpg" not in content + assert "GIT_INDEX_FILE" not in content + assert "nohup" not in content assert "setsid" not in content - # Atomic lock via mkdir (the only POSIX-atomic exclusive-create - # primitive available from shell). - assert "mkdir " in content - assert "rmdir " in content - # Stale-lock recovery for orphaned worker runs (>60min old). 
- assert "-mmin +60" in content - # Like post-merge, no --staged-only because the commit is already - # recorded and there's no useful index scope to filter. + assert "mkdir " not in content + assert "rmdir " not in content + assert "-mmin +60" not in content assert "--staged-only" not in content import stat assert post_commit.stat().st_mode & stat.S_IXUSR @@ -452,11 +440,11 @@ def test_install_post_commit_hook_is_idempotent(tmp_path): cmind_cli._install_git_post_commit_hook(repo) cmind_cli._install_git_post_commit_hook(repo) text = (repo / ".git" / "hooks" / "post-commit").read_text() - assert text.count("CoderMind: advance meta.git + background feature graph update") == 1 + assert text.count("CoderMind: post-commit dispatcher") == 1 def test_workspace_root_resolution_prefers_cwd_over_env(tmp_path, monkeypatch): - """Regression: hooks spawned by ``git`` always have cwd at the repo root. If a parent process previously set ``CMIND_WORKSPACE`` to a different workspace (e.g. the developer's CoderMind dev env), the inherited env var must NOT override the hook's actual workspace.""" + """Git hooks must prefer the repository cwd over inherited workspace env.""" # Set up two distinct workspaces real_ws = tmp_path / "real-ws" (real_ws / ".cmind").mkdir(parents=True) diff --git a/CoderMind/tests/test_step4_integration.py b/CoderMind/tests/test_step4_integration.py index c58c465..8b94eef 100644 --- a/CoderMind/tests/test_step4_integration.py +++ b/CoderMind/tests/test_step4_integration.py @@ -123,13 +123,13 @@ def test_refresh_dep_graph_safe_without_files_falls_back_to_full(codegen_workspa def test_refresh_dep_graph_safe_skips_non_py_files(codegen_workspace, caplog): - """If the batch only edited non-Python files, the function should short-circuit without scanning the AST.""" + """If the batch only edited non-source files, the function should short-circuit without scanning the AST.""" ws, code, _, _, _, run_batch = codegen_workspace import logging with 
caplog.at_level(logging.INFO, logger="run_batch"): run_batch._refresh_dep_graph_safe(code, changed_files=["README.md"]) assert any( - "no .py files" in record.getMessage() for record in caplog.records + "no supported source files" in record.getMessage() for record in caplog.records ), "expected the short-circuit log line" @@ -162,11 +162,15 @@ def __init__(self, task_type, file_path): # =========================================================================== -# 4b — _update_dep_graph_index persists dep_graph.json to disk +# 4b — _update_dep_graph_index attaches in-memory dep_graph for embedded save # =========================================================================== -def test_update_dep_graph_index_writes_dep_graph_json(tmp_path): - """Regression: the previous implementation called ``rpg.parse_dep_graph()`` which only mutated memory. After this fix, supplying ``save_path`` must produce ``dep_graph.json`` on disk with the freshly built graph.""" +def test_update_dep_graph_index_populates_in_memory_dep_graph(tmp_path): + """After the dep_graph-single-source migration, ``_update_dep_graph_index`` + no longer requires a standalone ``dep_graph.json`` write — the caller's + ``svc.save(rpg.json)`` embeds the in-memory graph via ``RPG.to_dict``. + The helper still mutates ``rpg.dep_graph`` so callers can serialise it. + """ from rpg_encoder.rpg_evolution import RPGEvolution import logging @@ -177,22 +181,26 @@ def test_update_dep_graph_index_writes_dep_graph_json(tmp_path): (repo / "y.py").write_text("from x import x\ndef y(): return x() * 2\n") rpg = RPG(repo_name="ws") - dep_graph_path = ws / "dep_graph.json" logger = logging.getLogger("test_4b") - RPGEvolution._update_dep_graph_index( - rpg, str(ws), logger, save_path=str(dep_graph_path), - ) + # ``save_path`` omitted: new default — dep_graph rides inside rpg.json. 
+ RPGEvolution._update_dep_graph_index(rpg, str(ws), logger) - assert dep_graph_path.is_file(), "dep_graph.json must be written" - data = json.loads(dep_graph_path.read_text()) - # Sanity: the on-disk file must reflect what's in memory - assert "nodes" in data - assert len(data["nodes"]) == len(rpg.dep_graph.G.nodes) + assert rpg.dep_graph is not None, "in-memory dep_graph must be attached" + assert rpg.dep_graph.G.number_of_nodes() >= 2, ( + "dep_graph must contain at least the two source files" + ) + # Round-trip through to_dict to prove embedding works. + serialised = rpg.to_dict() + assert "dep_graph" in serialised + assert serialised["dep_graph"]["nodes"] -def test_update_dep_graph_index_save_path_outside_rpg_dir(tmp_path): - """Regression: when ``save_path`` lives outside the default ``RPGService._rpg_dir`` (which defaults to cwd), the relative-path computation used to raise ``ValueError`` and the dep_graph save silently aborted. After the fix, ``_update_dep_graph_index`` anchors the service's ``_rpg_dir`` to the save_path's parent so the persisted reference becomes a clean relative ``dep_graph.json`` and the file actually lands on disk.""" +def test_update_dep_graph_index_legacy_save_path_still_writes_standalone(tmp_path): + """Backward-compat: callers that still pass ``save_path`` get the + standalone ``dep_graph.json`` written (legacy path preserved for + tooling that consumed the sidecar file directly). 
+ """ from rpg_encoder.rpg_evolution import RPGEvolution import logging @@ -205,7 +213,7 @@ def test_update_dep_graph_index_save_path_outside_rpg_dir(tmp_path): # Place the dep_graph in a deep tmpdir nobody's cwd ever traverses dep_graph_path = tmp_path / "elsewhere" / "dep_graph.json" - logger = logging.getLogger("test_4b_outside") + logger = logging.getLogger("test_4b_legacy") RPGEvolution._update_dep_graph_index( rpg, str(ws), logger, save_path=str(dep_graph_path), ) @@ -215,12 +223,15 @@ def test_update_dep_graph_index_save_path_outside_rpg_dir(tmp_path): assert dep_graph_path.is_file() # ``_dep_graph_file`` is stored relative to the save_path's parent # (which _update_dep_graph_index sets as _rpg_dir), so callers that - # ``RPGService.load`` the RPG later can still find it. + # ``RPGService.load`` the RPG later can still find the legacy file. assert rpg._dep_graph_file == "dep_graph.json" -def test_update_dep_graph_index_without_save_path_logs_warning(tmp_path, caplog): - """Legacy behaviour: when no save_path is provided the function still updates in-memory dep_graph but must warn so the user knows the standalone JSON is stale.""" +def test_update_dep_graph_index_without_save_path_logs_info(tmp_path, caplog): + """Default behaviour after the embed migration: no save_path means the + dep_graph is attached in memory and an INFO log records that it will + ride inside rpg.json on the caller's next save. 
+ """ from rpg_encoder.rpg_evolution import RPGEvolution import logging @@ -230,21 +241,23 @@ def test_update_dep_graph_index_without_save_path_logs_warning(tmp_path, caplog) (repo / "z.py").write_text("z = 1\n") rpg = RPG(repo_name="ws") - logger = logging.getLogger("test_4b_warn") - logger.setLevel(logging.WARNING) - with caplog.at_level(logging.WARNING, logger=logger.name): + logger = logging.getLogger("test_4b_info") + logger.setLevel(logging.INFO) + with caplog.at_level(logging.INFO, logger=logger.name): RPGEvolution._update_dep_graph_index(rpg, str(ws), logger) - # Must surface the "may be stale" warning + # Must surface the embed-on-save INFO log assert any( - "may be stale" in record.getMessage() for record in caplog.records - ), "expected legacy-behaviour warning" + "embeds into rpg.json" in record.getMessage() + for record in caplog.records + ), "expected embed-on-save info log" -def test_process_diff_threads_dep_graph_save_path(tmp_path): - """End-to-end check that ``process_diff`` propagates ``dep_graph_save_path`` through to ``_update_dep_graph_index``. +def test_process_diff_embeds_dep_graph_into_rpg(tmp_path): + """End-to-end check that ``process_diff`` produces an rpg with an + embedded dep_graph that can be round-tripped via ``RPG.to_dict``. We stub the LLM-driven sub-processes (``_process_add_files`` etc.) - so the test stays fast and focuses on the dep_graph write. + so the test stays fast and focuses on the dep_graph attach. 
""" from rpg_encoder.rpg_evolution import RPGEvolution import logging @@ -260,7 +273,6 @@ def test_process_diff_threads_dep_graph_save_path(tmp_path): (cur / "k.py").write_text("k = 1\n") rpg = RPG(repo_name="ws") - dep_graph_path = tmp_path / "dep_graph.json" logger = logging.getLogger("test_process_diff") # Stub exclusion (it would call LLM otherwise) @@ -268,7 +280,7 @@ def test_process_diff_threads_dep_graph_save_path(tmp_path): "rpg_encoder.rpg_encoding.RPGParser.exclude_irrelevant_files", return_value=[], ): - RPGEvolution.process_diff( + updated = RPGEvolution.process_diff( repo_name="ws", repo_info="", save_path="", @@ -278,12 +290,15 @@ def test_process_diff_threads_dep_graph_save_path(tmp_path): last_feature_tree=[], logger=logger, update_dep_graph=True, - dep_graph_save_path=str(dep_graph_path), + # dep_graph_save_path omitted on purpose: new default. ) - assert dep_graph_path.is_file(), ( - "dep_graph.json must be written even when there are 'no changes'" + assert updated.dep_graph is not None, ( + "process_diff must attach an in-memory dep_graph for downstream save" ) + serialised = updated.to_dict() + assert "dep_graph" in serialised + assert serialised["dep_graph"]["nodes"] # =========================================================================== @@ -330,7 +345,7 @@ def update_rpg_workspace(tmp_path): def test_run_update_rpg_advances_meta_git_and_runs_align(update_rpg_workspace, monkeypatch): - """Even on the "no changes" branch, ``run_update_rpg`` must: * write dep_graph.json (4b) * advance meta.git to the current HEAD (4c) * run enrich(align_only=True) (4c).""" + """Even on the "no changes" branch, ``run_update_rpg`` must: * embed dep_graph into rpg.json (4b) * advance meta.git to the current HEAD (4c) * run enrich(align_only=True) (4c).""" ws, repo, rpg_path, dep_graph_path = update_rpg_workspace # WORKSPACE_ROOT is resolved at import time inside common.paths. 
@@ -371,10 +386,9 @@ def test_run_update_rpg_advances_meta_git_and_runs_align(update_rpg_workspace, m assert persisted["meta"]["git"]["head_commit"] == head assert persisted["meta"]["git"]["head_branch"] == "main" - # dep_graph.json exists and is non-empty - assert dep_graph_path.is_file() - dg = json.loads(dep_graph_path.read_text()) - assert len(dg["nodes"]) > 0 + # dep_graph is embedded in rpg.json (single source of truth) + assert "dep_graph" in persisted + assert persisted["dep_graph"]["nodes"] def test_run_update_rpg_dep_graph_path_default_matches_constant(monkeypatch, tmp_path): diff --git a/CoderMind/tests/test_storage.py b/CoderMind/tests/test_storage.py index 845b89c..1618fc5 100644 --- a/CoderMind/tests/test_storage.py +++ b/CoderMind/tests/test_storage.py @@ -331,8 +331,7 @@ def test_reset_resets_init_version( ) second = _storage.read_meta(workspace) assert second is not None - # init_version should track the *current* call now, not the - # previously-recorded one. + # init_version should track the current call. assert second["cmind_cli_version_at_init"] == "0.2.0" assert second["cmind_cli_version_last_seen"] == "0.2.0" diff --git a/CoderMind/tests/test_sync_from_commit_diff.py b/CoderMind/tests/test_sync_from_commit_diff.py index 9024078..023d18c 100644 --- a/CoderMind/tests/test_sync_from_commit_diff.py +++ b/CoderMind/tests/test_sync_from_commit_diff.py @@ -527,11 +527,9 @@ def test_cli_sync_force_full(synced_repo): def test_cli_sync_missing_rpg_returns_actionable_error(tmp_path): """``sync`` must early-return with a /cmind.encode hint when rpg.json is absent. - Regression guard: previously ``RPGService.load`` raised - ``FileNotFoundError`` which the post-commit hook silently swallowed - via ``|| true``. We now want a structured error visible in the - hook log so the user can tell *why* the background updater did - nothing. 
+ Regression guard: missing RPG files should produce a structured + error visible in the hook log so the user can tell why the + background updater did nothing. """ script = _project_root / "scripts" / "update_graphs.py" missing = tmp_path / "does_not_exist.json"