diff --git a/.env.example b/.env.example index 0f3799e..5cf7e7f 100644 --- a/.env.example +++ b/.env.example @@ -45,7 +45,8 @@ MODEL_IDLE_TIMEOUT_SEC=180 # Runtime mode for optional Rust-backed provider/kernel paths. # off — default; use Python implementations. # required — selected Rust-backed paths must run and hard-fail on import/call errors. -# Currently selected paths: voiceprint scoring and result post-processing. +# Currently selected paths: voiceprint scoring, result post-processing, and +# artifact manifest helper contracts. # CI/Docker packaging still validates the Rust extension directly even when # the runtime default is off. RUST_KERNEL_MODE=off diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 2b4b849..233d182 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -2,7 +2,7 @@ name: Claude Code Review on: pull_request: - types: [opened, reopened, ready_for_review] + types: [opened, synchronize, reopened, ready_for_review] branches: [main] permissions: @@ -21,26 +21,17 @@ jobs: contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.pull_request.author_association) }} env: - HAS_ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY != '' }} - HAS_ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL != '' }} - HAS_CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN != '' }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} CLAUDE_MODEL: claude-sonnet-4-6 steps: - name: Skip when Claude secrets are not configured - if: >- - ${{ - env.HAS_CLAUDE_CODE_OAUTH_TOKEN != 'true' && - (env.HAS_ANTHROPIC_API_KEY != 'true' || env.HAS_ANTHROPIC_BASE_URL != 'true') - }} + if: ${{ env.ANTHROPIC_API_KEY == '' || env.ANTHROPIC_BASE_URL == '' }} run: echo "Claude Code review secrets are not configured; skipping Claude Code review." - name: Detect Claude review workflow changes id: claude-workflow-change - if: >- - ${{ - env.HAS_CLAUDE_CODE_OAUTH_TOKEN == 'true' || - (env.HAS_ANTHROPIC_API_KEY == 'true' && env.HAS_ANTHROPIC_BASE_URL == 'true') - }} + if: ${{ env.ANTHROPIC_API_KEY != '' && env.ANTHROPIC_BASE_URL != '' }} uses: actions/github-script@v8 with: script: | @@ -60,58 +51,14 @@ jobs: run: echo "Skipping Claude Code Review because this PR changes the review workflow itself." - name: Checkout repository - if: >- - ${{ - ( - env.HAS_CLAUDE_CODE_OAUTH_TOKEN == 'true' || - (env.HAS_ANTHROPIC_API_KEY == 'true' && env.HAS_ANTHROPIC_BASE_URL == 'true') - ) && - steps.claude-workflow-change.outputs.self_changed != 'true' - }} + if: ${{ env.ANTHROPIC_API_KEY != '' && env.ANTHROPIC_BASE_URL != '' && steps.claude-workflow-change.outputs.self_changed != 'true' }} uses: actions/checkout@v6 with: fetch-depth: 1 persist-credentials: false - - name: Run Claude Code review with OAuth - if: ${{ env.HAS_CLAUDE_CODE_OAUTH_TOKEN == 'true' && steps.claude-workflow-change.outputs.self_changed != 'true' }} - uses: anthropics/claude-code-action@v1 - with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - track_progress: true - use_sticky_comment: true - exclude_comments_by_actor: MapleEve,github-actions,codecov,sourcery-ai,copilot-pull-request-reviewer - prompt: | - REPO: ${{ github.repository }} - PR NUMBER: ${{ github.event.pull_request.number }} - - Review this pull request using REVIEW.md as the review-only guide. - Focus on actionable VoScript risks: - - Privacy and security leaks - - Model lifecycle races and GPU/CPU fallback behavior - - HTTP API compatibility - - Regression-test coverage - - Synchronized English/Chinese documentation - - The PR branch is already checked out in the current working directory. - Post feedback only through the official Claude Code Action GitHub integration. - Do not use the GitHub CLI and do not use a user-owned GitHub token. - If the official Claude GitHub App integration is unavailable, fail instead of posting as the repository owner. - If there are no actionable findings, post the standard no-findings confirmation through the action integration. - Avoid formatting-only comments. - - claude_args: | - --model ${{ env.CLAUDE_MODEL }} - --max-turns 30 - - - name: Run Claude Code review with API key - if: >- - ${{ - env.HAS_CLAUDE_CODE_OAUTH_TOKEN != 'true' && - env.HAS_ANTHROPIC_API_KEY == 'true' && - env.HAS_ANTHROPIC_BASE_URL == 'true' && - steps.claude-workflow-change.outputs.self_changed != 'true' - }} + - name: Run Claude Code review + if: ${{ env.ANTHROPIC_API_KEY != '' && env.ANTHROPIC_BASE_URL != '' && steps.claude-workflow-change.outputs.self_changed != 'true' }} uses: anthropics/claude-code-action@v1 env: ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} diff --git a/.github/workflows/issue-autocomment.yml b/.github/workflows/issue-autocomment.yml index 9cf2230..8eb0a01 100644 --- a/.github/workflows/issue-autocomment.yml +++ b/.github/workflows/issue-autocomment.yml @@ -1,4 +1,5 @@ name: Issue Auto Comment + on: issues: types: @@ -16,58 +17,99 @@ permissions: jobs: run: permissions: - issues: write # for actions-cool/issues-helper to update issues - pull-requests: write # for actions-cool/issues-helper to update PRs + issues: write + pull-requests: write runs-on: ubuntu-latest steps: - - name: Auto Comment on Issues Opened - uses: wow-actions/auto-comment@v1 + - name: Comment and maintain labels + uses: actions/github-script@v8 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} - issuesOpened: | - 👀 @{{ author }} + script: | + const { owner, repo } = context.repo; - Thank you for raising an issue. We will investigate into the matter and get back to you as soon as possible. - Please make sure you have given us as much context as possible.\ - 非常感谢您提交 issue。我们会尽快调查此事,并尽快回复您。 请确保您已经提供了尽可能多的背景信息。 - - name: Auto Comment on Issues Closed - uses: wow-actions/auto-comment@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} - issuesClosed: | - ✅ @{{ author }} + async function createComment(issueNumber, body) { + await github.rest.issues.createComment({ + owner, + repo, + issue_number: issueNumber, + body, + }); + } - This issue is closed, If you have any questions, you can comment and reply.\ - 此问题已经关闭。如果您有任何问题,可以留言并回复。 - - name: Auto Comment on Pull Request Opened - uses: wow-actions/auto-comment@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} - pullRequestOpened: | - 👍 @{{ author }} + if (context.eventName === "issues") { + const issue = context.payload.issue; + const actor = context.actor; + const author = issue.user.login; - Thank you for raising your pull request and contributing to voscript. - Please make sure you have followed our contributing guidelines. We will review it as soon as possible. - If you encounter any problems, please feel free to connect with us.\ - 非常感谢您提出拉取请求并为 voscript 做出贡献,请确保您已经遵循了我们的贡献指南,我们会尽快审查它。 - 如果您遇到任何问题,请随时与我们联系。 - - name: Auto Comment on Pull Request Merged - uses: actions-cool/pr-welcome@main - if: github.event.pull_request.merged == true - with: - token: ${{ secrets.GITHUB_TOKEN }} - comment: | - ❤️ Great PR @${{ github.event.pull_request.user.login }} ❤️ + if (context.payload.action === "opened") { + await createComment( + issue.number, + [ + `👀 @${author}`, + "", + "Thank you for raising an issue. We will investigate into the matter and get back to you as soon as possible.", + "Please make sure you have given us as much context as possible.", + "非常感谢您提交 issue。我们会尽快调查此事,并尽快回复您。请确保您已经提供了尽可能多的背景信息。", + ].join("\n"), + ); + } - The growth of project is inseparable from user feedback and contribution, thanks for your contribution! \ - 项目的成长离不开用户反馈和贡献,感谢您的贡献! - emoji: 'hooray' - pr-emoji: '+1, heart' - - name: Remove inactive - if: github.event.issue.state == 'open' && github.actor == github.event.issue.user.login - uses: actions-cool/issues-helper@v3 - with: - actions: 'remove-labels' - token: ${{ secrets.GITHUB_TOKEN }} - issue-number: ${{ github.event.issue.number }} - labels: 'Inactive' \ No newline at end of file + if (context.payload.action === "closed") { + await createComment( + issue.number, + [ + `✅ @${author}`, + "", + "This issue is closed. If you have any questions, you can comment and reply.", + "此问题已经关闭。如果您有任何问题,可以留言并回复。", + ].join("\n"), + ); + } + + if (issue.state === "open" && actor === author) { + try { + await github.rest.issues.removeLabel({ + owner, + repo, + issue_number: issue.number, + name: "Inactive", + }); + } catch (error) { + if (error.status !== 404) { + throw error; + } + } + } + } + + if (context.eventName === "pull_request_target") { + const pull = context.payload.pull_request; + const author = pull.user.login; + + if (context.payload.action === "opened") { + await createComment( + pull.number, + [ + `👍 @${author}`, + "", + "Thank you for raising your pull request and contributing to VoScript.", + "Please make sure you have followed our contributing guidelines. We will review it as soon as possible.", + "If you encounter any problems, please feel free to connect with us.", + "非常感谢您提出拉取请求并为 VoScript 做出贡献,请确保您已经遵循了我们的贡献指南,我们会尽快审查它。", + "如果您遇到任何问题,请随时与我们联系。", + ].join("\n"), + ); + } + + if (context.payload.action === "closed" && pull.merged) { + await createComment( + pull.number, + [ + `❤️ Great PR @${author} ❤️`, + "", + "The growth of the project is inseparable from user feedback and contribution. Thanks for your contribution!", + "项目的成长离不开用户反馈和贡献,感谢您的贡献!", + ].join("\n"), + ); + } + } diff --git a/.github/workflows/rust-foundation-heavy.yml b/.github/workflows/rust-foundation-heavy.yml index 036e736..f8f1c40 100644 --- a/.github/workflows/rust-foundation-heavy.yml +++ b/.github/workflows/rust-foundation-heavy.yml @@ -109,7 +109,7 @@ jobs: docker run --rm \ -e RUST_KERNEL_MODE=required \ voscript-rust-foundation:${{ github.sha }} \ - python -c "from providers.kernel_bridge import core_smoke, postprocess_segments, voiceprint_score; result = core_smoke({'source': 'ci'}); assert result['ok'] is True; assert result['echoed']['source'] == 'ci'; decision = voiceprint_score({'query_embedding': [1.0, 0.0], 'candidates': [{'speaker_id': 'spk_ci', 'name': 'CI', 'embedding': [1.0, 0.0], 'sample_count': 1, 'sample_spread': None}], 'threshold': 0.75, 'asnorm_threshold': 0.5, 'cohort': None}); assert decision['matched_id'] == 'spk_ci'; assert decision['reason'] == 'matched'; processed = postprocess_segments({'aligned_segments': [{'start': 0.0, 'end': 1.0, 'text': 'hello', 'speaker': 'SPEAKER_00'}], 'speaker_map': {}}); assert processed['segments'][0]['speaker_label'] == 'SPEAKER_00'; assert processed['unique_speakers'] == ['SPEAKER_00']" + python -c "from providers.kernel_bridge import artifact_manifest_contract, core_smoke, postprocess_segments, status_payload_contract, voiceprint_score; result = core_smoke({'source': 'ci'}); assert result['ok'] is True; assert result['echoed']['source'] == 'ci'; decision = voiceprint_score({'query_embedding': [1.0, 0.0], 'candidates': [{'speaker_id': 'spk_ci', 'name': 'CI', 'embedding': [1.0, 0.0], 'sample_count': 1, 'sample_spread': None}], 'threshold': 0.75, 'asnorm_threshold': 0.5, 'cohort': None}); assert decision['matched_id'] == 'spk_ci'; assert decision['reason'] == 'matched'; processed = postprocess_segments({'aligned_segments': [{'start': 0.0, 'end': 1.0, 'text': 'hello', 'speaker': 'SPEAKER_00'}], 'speaker_map': {}}); assert processed['segments'][0]['speaker_label'] == 'SPEAKER_00'; assert processed['unique_speakers'] == ['SPEAKER_00']; manifest = artifact_manifest_contract({'manifest_version': 'artifact_manifest.v1', 'stable': [{'name': 'result', 'filename': 'result.json', 'role': 'primary_result', 'media_type': 'application/json', 'required_for_result': True}], 'optional': [], 'experimental': []}); assert manifest['stable'][0]['filename'] == 'result.json'; status = status_payload_contract({'status': 'queued', 'updated_at': '2026-06-09T00:00:00+00:00', 'filename': 'private/audio.wav'}); assert status['status'] == 'queued'; assert status['filename'] == 'audio.wav'" - name: Run health check smoke run: | diff --git a/Cargo.lock b/Cargo.lock index 001a789..8b86cf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -127,7 +127,7 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "voscript_core" -version = "0.8.2" +version = "0.8.3" dependencies = [ "pyo3", ] diff --git a/app/api/routers/transcriptions.py b/app/api/routers/transcriptions.py index 128a2ad..a8378a8 100644 --- a/app/api/routers/transcriptions.py +++ b/app/api/routers/transcriptions.py @@ -35,6 +35,7 @@ ) from infra.job_persistence import _atomic_write_json, _write_status from infra.job_runtime import jobs, register_in_flight, unregister_in_flight +from pipeline.contracts import normalize_status_payload _SPK_ID_RE = re.compile(r"^spk_[A-Za-z0-9_-]{1,64}$") @@ -247,7 +248,7 @@ async def get_job( if status_path.exists(): try: - status_data = json.loads(status_path.read_text()) + status_data = normalize_status_payload(json.loads(status_path.read_text())) except Exception: raise HTTPException(404, "Job not found") diff --git a/app/infra/job_persistence.py b/app/infra/job_persistence.py index ea7ea9a..c1b8136 100644 --- a/app/infra/job_persistence.py +++ b/app/infra/job_persistence.py @@ -8,6 +8,11 @@ from pathlib import Path from config import TRANSCRIPTIONS_DIR +from pipeline.contracts import ( + TERMINAL_JOB_STATUSES, + build_status_payload, + normalize_status_payload, +) logger = logging.getLogger(__name__) @@ -49,13 +54,12 @@ def _write_status( """ status_path = TRANSCRIPTIONS_DIR / job_id / "status.json" try: - payload = { - "status": status, - "updated_at": datetime.now(tz=timezone.utc).isoformat(), - "error": error, - } - if filename is not None: - payload["filename"] = filename + payload = build_status_payload( + status, + error=error, + filename=filename, + updated_at=datetime.now(tz=timezone.utc).isoformat(), + ) _atomic_write_json(status_path, payload) return True except Exception as exc: @@ -68,11 +72,14 @@ def recover_orphan_jobs() -> None: try: for status_path in TRANSCRIPTIONS_DIR.glob("*/status.json"): try: - data = json.loads(status_path.read_text()) - if data.get("status") not in ("completed", "failed"): - data["status"] = "failed" - data["error"] = "Process restarted while job was in progress" - data["updated_at"] = datetime.now(tz=timezone.utc).isoformat() + data = normalize_status_payload(json.loads(status_path.read_text())) + if data.get("status") not in TERMINAL_JOB_STATUSES: + data = build_status_payload( + "failed", + error="Process restarted while job was in progress", + filename=data.get("filename"), + updated_at=datetime.now(tz=timezone.utc).isoformat(), + ) _atomic_write_json(status_path, data) logger.info( "AR-C2: marked orphan job %s as failed", diff --git a/app/pipeline/contracts/__init__.py b/app/pipeline/contracts/__init__.py index 591afa5..3c08cb0 100644 --- a/app/pipeline/contracts/__init__.py +++ b/app/pipeline/contracts/__init__.py @@ -2,6 +2,7 @@ from .asr import ASRProvider, ASRRequest, ASRResult from .artifacts import ( + ARTIFACT_MANIFEST_CATEGORIES, ARTIFACT_MANIFEST_VERSION, AsyncUploadReader, AudioArtifactIndex, @@ -12,6 +13,8 @@ TranscriptionArtifactWriteRequest, UploadPersistenceRequest, build_artifact_manifest, + empty_artifact_manifest, + normalize_artifact_manifest, ) from .context import PipelineContext from .diarization import ( @@ -41,6 +44,27 @@ ) from .requests import PipelineRequest from .results import PipelineResult +from .schema import ( + OPTIONAL_FIRST_SCHEMA_POLICY, + SCHEMA_VERSION_KEY, + attach_optional_schema_version, + read_optional_schema_version, +) +from .status import ( + IN_PROGRESS_JOB_STATUSES, + JOB_STATUS_COMPLETED, + JOB_STATUS_CONVERTING, + JOB_STATUS_DENOISING, + JOB_STATUS_FAILED, + JOB_STATUS_IDENTIFYING, + JOB_STATUS_QUEUED, + JOB_STATUS_TRANSCRIBING, + KNOWN_JOB_STATUSES, + TERMINAL_JOB_STATUSES, + build_status_payload, + normalize_job_status, + normalize_status_payload, +) from .voiceprint_match import ( VoiceprintMatchProvider, VoiceprintMatchRequest, @@ -51,6 +75,7 @@ "ASRProvider", "ASRRequest", "ASRResult", + "ARTIFACT_MANIFEST_CATEGORIES", "ARTIFACT_MANIFEST_VERSION", "AsyncUploadReader", "AudioArtifactIndex", @@ -64,6 +89,16 @@ "DiarizationRequest", "DiarizationResult", "InputNormalizationProvider", + "IN_PROGRESS_JOB_STATUSES", + "JOB_STATUS_COMPLETED", + "JOB_STATUS_CONVERTING", + "JOB_STATUS_DENOISING", + "JOB_STATUS_FAILED", + "JOB_STATUS_IDENTIFYING", + "JOB_STATUS_QUEUED", + "JOB_STATUS_TRANSCRIBING", + "KNOWN_JOB_STATUSES", + "OPTIONAL_FIRST_SCHEMA_POLICY", "PersistedTranscriptionArtifacts", "PipelineContext", "PipelineLookupError", @@ -74,12 +109,21 @@ "SpeakerEmbeddingRequest", "SpeakerEmbeddingResult", "ProviderNotFoundError", + "SCHEMA_VERSION_KEY", "StageNotFoundError", + "TERMINAL_JOB_STATUSES", "TranscriptionArtifactStore", "TranscriptionArtifactWriteRequest", "UploadPersistenceRequest", "VoiceprintMatchProvider", "VoiceprintMatchRequest", "VoiceprintMatchResult", + "attach_optional_schema_version", "build_artifact_manifest", + "build_status_payload", + "empty_artifact_manifest", + "normalize_artifact_manifest", + "normalize_job_status", + "normalize_status_payload", + "read_optional_schema_version", ] diff --git a/app/pipeline/contracts/artifacts.py b/app/pipeline/contracts/artifacts.py index 369c6d3..df502d4 100644 --- a/app/pipeline/contracts/artifacts.py +++ b/app/pipeline/contracts/artifacts.py @@ -2,11 +2,33 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass from pathlib import Path +from pathlib import PurePosixPath +import re from typing import Any, Protocol, runtime_checkable ARTIFACT_MANIFEST_VERSION = "artifact_manifest.v1" +ARTIFACT_MANIFEST_CATEGORIES = ("stable", "optional", "experimental") + +_CONTROL_RE = re.compile(r"[\x00-\x1f\x7f]+") +_UNSAFE_FILENAME_RE = re.compile(r"[\\/]|://") + + +def _public_safe_text(value: Any, field_name: str) -> str: + text = _CONTROL_RE.sub(" ", str(value or "")).strip() + if not text: + raise ValueError(f"artifact manifest {field_name} must not be empty") + return text + + +def _public_safe_filename(value: Any) -> str: + text = _public_safe_text(value, "filename") + # Filenames in the manifest are artifact names only, never host-local paths. + if _UNSAFE_FILENAME_RE.search(text) or PurePosixPath(text).name != text: + raise ValueError("artifact manifest filename must not include a path or URL") + return text class AsyncUploadReader(Protocol): @@ -81,19 +103,83 @@ class ArtifactManifestEntry: required_for_result: bool = False speaker_label: str | None = None + @classmethod + def from_mapping(cls, payload: Mapping[str, Any]) -> "ArtifactManifestEntry": + """Build an entry from a mapping while ignoring unknown fields.""" + + return cls( + name=_public_safe_text(payload.get("name"), "name"), + filename=_public_safe_filename(payload.get("filename")), + role=_public_safe_text(payload.get("role"), "role"), + media_type=_public_safe_text(payload.get("media_type"), "media_type"), + required_for_result=bool(payload.get("required_for_result", False)), + speaker_label=( + _public_safe_text(payload.get("speaker_label"), "speaker_label") + if payload.get("speaker_label") is not None + else None + ), + ) + def as_dict(self) -> dict[str, Any]: payload: dict[str, Any] = { - "name": self.name, - "filename": self.filename, - "role": self.role, - "media_type": self.media_type, + "name": _public_safe_text(self.name, "name"), + "filename": _public_safe_filename(self.filename), + "role": _public_safe_text(self.role, "role"), + "media_type": _public_safe_text(self.media_type, "media_type"), "required_for_result": self.required_for_result, } if self.speaker_label is not None: - payload["speaker_label"] = self.speaker_label + payload["speaker_label"] = _public_safe_text( + self.speaker_label, "speaker_label" + ) return payload +def empty_artifact_manifest() -> dict[str, Any]: + """Return a compatible empty manifest for legacy results.""" + + return { + "manifest_version": ARTIFACT_MANIFEST_VERSION, + "stable": [], + "optional": [], + "experimental": [], + } + + +def normalize_artifact_manifest(manifest: Mapping[str, Any] | None) -> dict[str, Any]: + """Normalize a stored optional manifest without requiring clients to read it. + + Missing, malformed, or forward-compatible unknown entries are tolerated for + reads. Known entry fields are copied only after public-safe normalization, + so local paths, host details, tokens, job ids, or speaker ids from unknown + fields cannot leak through this helper. + """ + + if not isinstance(manifest, Mapping): + return empty_artifact_manifest() + + normalized = empty_artifact_manifest() + version = manifest.get("manifest_version") + if isinstance(version, str) and version.strip(): + normalized["manifest_version"] = _public_safe_text(version, "manifest_version") + + for category in ARTIFACT_MANIFEST_CATEGORIES: + entries = manifest.get(category, []) + if not isinstance(entries, list): + continue + for raw_entry in entries: + if not isinstance(raw_entry, Mapping): + continue + try: + normalized[category].append( + ArtifactManifestEntry.from_mapping(raw_entry).as_dict() + ) + except ValueError: + # Legacy/forward-compatible reads tolerate bad optional entries. + continue + return normalized + + def build_artifact_manifest( stable: list[ArtifactManifestEntry], optional: list[ArtifactManifestEntry] | None = None, @@ -101,12 +187,14 @@ def build_artifact_manifest( ) -> dict[str, Any]: """Build the optional artifact manifest for a completed transcription.""" - return { - "manifest_version": ARTIFACT_MANIFEST_VERSION, - "stable": [entry.as_dict() for entry in stable], - "optional": [entry.as_dict() for entry in optional or []], - "experimental": [entry.as_dict() for entry in experimental or []], - } + return normalize_artifact_manifest( + { + "manifest_version": ARTIFACT_MANIFEST_VERSION, + "stable": [entry.as_dict() for entry in stable], + "optional": [entry.as_dict() for entry in optional or []], + "experimental": [entry.as_dict() for entry in experimental or []], + } + ) @runtime_checkable @@ -120,6 +208,7 @@ def persist_transcription( __all__ = [ "ARTIFACT_MANIFEST_VERSION", + "ARTIFACT_MANIFEST_CATEGORIES", "AsyncUploadReader", "AudioArtifactIndex", "ArtifactManifestEntry", @@ -129,4 +218,6 @@ def persist_transcription( "TranscriptionArtifactWriteRequest", "UploadPersistenceRequest", "build_artifact_manifest", + "empty_artifact_manifest", + "normalize_artifact_manifest", ] diff --git a/app/pipeline/contracts/schema.py b/app/pipeline/contracts/schema.py new file mode 100644 index 0000000..462f7b7 --- /dev/null +++ b/app/pipeline/contracts/schema.py @@ -0,0 +1,47 @@ +"""Optional-first schema-version helpers for result/status artifacts.""" + +from __future__ import annotations + +from collections.abc import Mapping +import re +from typing import Any + +SCHEMA_VERSION_KEY = "schema_version" +OPTIONAL_FIRST_SCHEMA_POLICY = "optional_first" + +_VERSION_RE = re.compile(r"^[A-Za-z0-9_.-]{1,64}$") + + +def read_optional_schema_version(payload: Mapping[str, Any] | None) -> str | None: + """Read an optional schema_version without requiring legacy artifacts to set it.""" + + if not isinstance(payload, Mapping): + return None + value = payload.get(SCHEMA_VERSION_KEY) + if value is None: + return None + if not isinstance(value, str) or not _VERSION_RE.fullmatch(value): + raise ValueError("schema_version must be a short public-safe string") + return value + + +def attach_optional_schema_version( + payload: Mapping[str, Any], + schema_version: str | None, +) -> dict[str, Any]: + """Return a copy with schema_version only when a stable version is needed.""" + + result = dict(payload) + if schema_version is not None: + if not _VERSION_RE.fullmatch(schema_version): + raise ValueError("schema_version must be a short public-safe string") + result[SCHEMA_VERSION_KEY] = schema_version + return result + + +__all__ = [ + "OPTIONAL_FIRST_SCHEMA_POLICY", + "SCHEMA_VERSION_KEY", + "attach_optional_schema_version", + "read_optional_schema_version", +] diff --git a/app/pipeline/contracts/status.py b/app/pipeline/contracts/status.py new file mode 100644 index 0000000..606b12a --- /dev/null +++ b/app/pipeline/contracts/status.py @@ -0,0 +1,118 @@ +"""Stable contract helpers for persisted job status payloads.""" + +from __future__ import annotations + +from collections.abc import Mapping +from datetime import datetime, timezone +from pathlib import PurePosixPath +import re +from typing import Any + +JOB_STATUS_QUEUED = "queued" +JOB_STATUS_CONVERTING = "converting" +JOB_STATUS_DENOISING = "denoising" +JOB_STATUS_TRANSCRIBING = "transcribing" +JOB_STATUS_IDENTIFYING = "identifying" +JOB_STATUS_COMPLETED = "completed" +JOB_STATUS_FAILED = "failed" + +IN_PROGRESS_JOB_STATUSES = frozenset( + { + JOB_STATUS_QUEUED, + JOB_STATUS_CONVERTING, + JOB_STATUS_DENOISING, + JOB_STATUS_TRANSCRIBING, + JOB_STATUS_IDENTIFYING, + } +) +TERMINAL_JOB_STATUSES = frozenset({JOB_STATUS_COMPLETED, JOB_STATUS_FAILED}) +KNOWN_JOB_STATUSES = IN_PROGRESS_JOB_STATUSES | TERMINAL_JOB_STATUSES + +_CONTROL_RE = re.compile(r"[\x00-\x1f\x7f]+") + + +def _utc_now_iso() -> str: + return datetime.now(tz=timezone.utc).isoformat() + + +def _public_safe_text(value: Any) -> str: + return _CONTROL_RE.sub(" ", str(value or "")).strip() + + +def _public_safe_filename(value: Any) -> str | None: + if value is None: + return None + text = _public_safe_text(value).replace("\\", "/") + filename = PurePosixPath(text).name + return filename or None + + +def normalize_job_status(status: Any, *, default: str = JOB_STATUS_FAILED) -> str: + """Return a known status, defaulting invalid legacy values to failed.""" + + value = _public_safe_text(status).lower() + if value in KNOWN_JOB_STATUSES: + return value + return default + + +def build_status_payload( + status: str, + *, + error: Any = None, + filename: Any = None, + updated_at: str | None = None, +) -> dict[str, Any]: + """Build the persisted status.json payload without changing API shape.""" + + normalized_status = normalize_job_status(status, default="") + if not normalized_status: + raise ValueError(f"unknown job status: {status!r}") + + payload: dict[str, Any] = { + "status": normalized_status, + "updated_at": updated_at or _utc_now_iso(), + "error": None if error is None else _public_safe_text(error), + } + safe_filename = _public_safe_filename(filename) + if safe_filename is not None: + payload["filename"] = safe_filename + return payload + + +def normalize_status_payload(payload: Mapping[str, Any] | None) -> dict[str, Any]: + """Normalize legacy status.json payloads for restart/recovery reads.""" + + if not isinstance(payload, Mapping): + return build_status_payload( + JOB_STATUS_FAILED, + error="Invalid persisted job status", + ) + + updated_at = payload.get("updated_at") + if not isinstance(updated_at, str) or not updated_at.strip(): + updated_at = _utc_now_iso() + + return build_status_payload( + normalize_job_status(payload.get("status")), + error=payload.get("error"), + filename=payload.get("filename"), + updated_at=updated_at, + ) + + +__all__ = [ + "IN_PROGRESS_JOB_STATUSES", + "JOB_STATUS_COMPLETED", + "JOB_STATUS_CONVERTING", + "JOB_STATUS_DENOISING", + "JOB_STATUS_FAILED", + "JOB_STATUS_IDENTIFYING", + "JOB_STATUS_QUEUED", + "JOB_STATUS_TRANSCRIBING", + "KNOWN_JOB_STATUSES", + "TERMINAL_JOB_STATUSES", + "build_status_payload", + "normalize_job_status", + "normalize_status_payload", +] diff --git a/app/providers/artifacts/default.py b/app/providers/artifacts/default.py index b3a55c0..f02edf1 100644 --- a/app/providers/artifacts/default.py +++ b/app/providers/artifacts/default.py @@ -9,7 +9,11 @@ from infra.audio.paths import safe_speaker_label from infra.transcription_artifacts import persist_transcription_artifacts from postprocess.segments import build_display_names, build_result_segments -from providers.kernel_bridge import postprocess_segments, rust_provider_paths_enabled +from providers.kernel_bridge import ( + artifact_manifest_contract, + postprocess_segments, + rust_provider_paths_enabled, +) from pipeline.contracts import ( ArtifactManifestEntry, PipelineContext, @@ -116,7 +120,10 @@ def _build_artifact_manifest(speaker_labels: list[str]) -> dict: ) for speaker_label in speaker_labels ) - return build_artifact_manifest(stable=stable) + manifest = build_artifact_manifest(stable=stable) + if rust_provider_paths_enabled(): + return artifact_manifest_contract(manifest) + return manifest def build(self, context: PipelineContext) -> PipelineResult: transcription = self._build_transcription(context) diff --git a/app/providers/kernel_bridge/__init__.py b/app/providers/kernel_bridge/__init__.py index e6c134c..515c2e6 100644 --- a/app/providers/kernel_bridge/__init__.py +++ b/app/providers/kernel_bridge/__init__.py @@ -4,11 +4,13 @@ RUST_KERNEL_MODE_OFF, RUST_KERNEL_MODE_REQUIRED, RustKernelBridgeError, + artifact_manifest_contract, core_smoke, postprocess_segments, require_rust_core, rust_kernel_mode, rust_provider_paths_enabled, + status_payload_contract, voiceprint_score, ) @@ -16,10 +18,12 @@ "RUST_KERNEL_MODE_OFF", "RUST_KERNEL_MODE_REQUIRED", "RustKernelBridgeError", + "artifact_manifest_contract", "core_smoke", "postprocess_segments", "require_rust_core", "rust_kernel_mode", "rust_provider_paths_enabled", + "status_payload_contract", "voiceprint_score", ] diff --git a/app/providers/kernel_bridge/runtime.py b/app/providers/kernel_bridge/runtime.py index 36ba553..6c65820 100644 --- a/app/providers/kernel_bridge/runtime.py +++ b/app/providers/kernel_bridge/runtime.py @@ -206,6 +206,111 @@ def _validate_postprocess_segments_response(response: Any) -> dict[str, Any]: return result +def _validate_artifact_manifest_contract_response(response: Any) -> dict[str, Any]: + if not isinstance(response, Mapping): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract returned a non-mapping response" + ) + + result = dict(response) + required_keys = {"manifest_version", "stable", "optional", "experimental"} + missing = sorted(required_keys.difference(result)) + if missing: + raise RustKernelBridgeError( + "Rust artifact_manifest_contract response missing keys: " + + ", ".join(missing) + ) + if ( + not isinstance(result["manifest_version"], str) + or not result["manifest_version"] + ): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract manifest_version must be non-empty" + ) + for category in ("stable", "optional", "experimental"): + if not isinstance(result[category], list): + raise RustKernelBridgeError( + f"Rust artifact_manifest_contract {category} must be a list" + ) + result[category] = [ + _validate_artifact_manifest_entry_response(entry) + for entry in result[category] + ] + return result + + +def _validate_artifact_manifest_entry_response(entry: Any) -> dict[str, Any]: + if not isinstance(entry, Mapping): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract entry returned a non-mapping response" + ) + result = dict(entry) + required_keys = { + "name", + "filename", + "role", + "media_type", + "required_for_result", + } + missing = sorted(required_keys.difference(result)) + if missing: + raise RustKernelBridgeError( + "Rust artifact_manifest_contract entry missing keys: " + ", ".join(missing) + ) + for key in ("name", "filename", "role", "media_type"): + if not isinstance(result[key], str) or not result[key]: + raise RustKernelBridgeError( + f"Rust artifact_manifest_contract entry {key} must be non-empty" + ) + if ( + "/" in result["filename"] + or "\\" in result["filename"] + or "://" in result["filename"] + ): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract filename must not expose a path" + ) + if not isinstance(result["required_for_result"], bool): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract required_for_result must be bool" + ) + if "speaker_label" in result and not isinstance(result["speaker_label"], str): + raise RustKernelBridgeError( + "Rust artifact_manifest_contract speaker_label must be string" + ) + return result + + +def _validate_status_payload_contract_response(response: Any) -> dict[str, Any]: + if not isinstance(response, Mapping): + raise RustKernelBridgeError( + "Rust status_payload_contract returned a non-mapping response" + ) + + result = dict(response) + required_keys = {"status", "updated_at", "error"} + missing = sorted(required_keys.difference(result)) + if missing: + raise RustKernelBridgeError( + "Rust status_payload_contract response missing keys: " + ", ".join(missing) + ) + if not isinstance(result["status"], str) or not result["status"]: + raise RustKernelBridgeError("Rust status_payload_contract status is invalid") + if not isinstance(result["updated_at"], str) or not result["updated_at"]: + raise RustKernelBridgeError( + "Rust status_payload_contract updated_at is invalid" + ) + if result["error"] is not None and not isinstance(result["error"], str): + raise RustKernelBridgeError( + "Rust status_payload_contract error must be string or null" + ) + if "filename" in result and not isinstance(result["filename"], str): + raise RustKernelBridgeError( + "Rust status_payload_contract filename must be string" + ) + return result + + def _validate_postprocess_segment_response(segment: Any) -> dict[str, Any]: if not isinstance(segment, Mapping): raise RustKernelBridgeError( @@ -317,6 +422,36 @@ def core_smoke( return _validate_smoke_response(response) +def artifact_manifest_contract( + payload: dict[str, Any], + importer: Callable[[str], ModuleType] = import_module, +) -> dict[str, Any]: + """Call the native artifact manifest helper contract.""" + + rust_core = require_rust_core(importer=importer) + try: + response = rust_core.artifact_manifest_contract(payload) + except Exception as exc: + raise RustKernelBridgeError( + "Rust artifact_manifest_contract call failed" + ) from exc + return _validate_artifact_manifest_contract_response(response) + + +def status_payload_contract( + payload: dict[str, Any], + importer: Callable[[str], ModuleType] = import_module, +) -> dict[str, Any]: + """Call the native status payload helper contract.""" + + rust_core = require_rust_core(importer=importer) + try: + response = rust_core.status_payload_contract(payload) + except Exception as exc: + raise RustKernelBridgeError("Rust status_payload_contract call failed") from exc + return _validate_status_payload_contract_response(response) + + def voiceprint_score( payload: dict[str, Any], importer: Callable[[str], ModuleType] = import_module, diff --git a/crates/voscript_core/Cargo.toml b/crates/voscript_core/Cargo.toml index 883d2c9..dacdac6 100644 --- a/crates/voscript_core/Cargo.toml +++ b/crates/voscript_core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "voscript_core" -version = "0.8.2" +version = "0.8.3" edition = "2021" license = "Apache-2.0" publish = false diff --git a/crates/voscript_core/src/contracts.rs b/crates/voscript_core/src/contracts.rs new file mode 100644 index 0000000..d21942c --- /dev/null +++ b/crates/voscript_core/src/contracts.rs @@ -0,0 +1,115 @@ +pub const ARTIFACT_MANIFEST_VERSION: &str = "artifact_manifest.v1"; +pub const ARTIFACT_MANIFEST_CATEGORIES: [&str; 3] = ["stable", "optional", "experimental"]; + +pub const JOB_STATUS_QUEUED: &str = "queued"; +pub const JOB_STATUS_CONVERTING: &str = "converting"; +pub const JOB_STATUS_DENOISING: &str = "denoising"; +pub const JOB_STATUS_TRANSCRIBING: &str = "transcribing"; +pub const JOB_STATUS_IDENTIFYING: &str = "identifying"; +pub const JOB_STATUS_COMPLETED: &str = "completed"; +pub const JOB_STATUS_FAILED: &str = "failed"; + +#[derive(Clone, Debug, PartialEq)] +pub struct ArtifactManifestEntry { + pub name: String, + pub filename: String, + pub role: String, + pub media_type: String, + pub required_for_result: bool, + pub speaker_label: Option, +} + +fn strip_control_chars(value: &str) -> String { + value + .chars() + .map(|ch| if ch.is_control() { ' ' } else { ch }) + .collect::() + .trim() + .to_string() +} + +pub fn public_safe_text(value: &str, field_name: &str) -> Result { + let text = strip_control_chars(value); + if text.is_empty() { + return Err(format!("artifact manifest {field_name} must not be empty")); + } + Ok(text) +} + +pub fn public_safe_manifest_filename(value: &str) -> Result { + let filename = public_safe_text(value, "filename")?; + if filename.contains('/') + || filename.contains('\\') + || filename.contains("://") + || filename == "." + || filename == ".." + { + return Err("artifact manifest filename must not include a path or URL".to_string()); + } + Ok(filename) +} + +pub fn public_safe_status_filename(value: &str) -> Option { + let normalized = strip_control_chars(&value.replace('\\', "/")); + let filename = normalized.rsplit('/').next().unwrap_or("").trim(); + if filename.is_empty() { + None + } else { + Some(filename.to_string()) + } +} + +pub fn is_known_job_status(status: &str) -> bool { + matches!( + status, + JOB_STATUS_QUEUED + | JOB_STATUS_CONVERTING + | JOB_STATUS_DENOISING + | JOB_STATUS_TRANSCRIBING + | JOB_STATUS_IDENTIFYING + | JOB_STATUS_COMPLETED + | JOB_STATUS_FAILED + ) +} + +pub fn normalize_job_status(status: &str) -> String { + let value = strip_control_chars(status).to_ascii_lowercase(); + if is_known_job_status(value.as_str()) { + value + } else { + JOB_STATUS_FAILED.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn manifest_filename_rejects_paths_and_urls() { + assert!(public_safe_manifest_filename("result.json").is_ok()); + assert!(public_safe_manifest_filename("../result.json").is_err()); + assert!(public_safe_manifest_filename("http://host/result.json").is_err()); + } + + #[test] + fn status_filename_keeps_only_basename() { + assert_eq!( + public_safe_status_filename("private/audio.wav").as_deref(), + Some("audio.wav") + ); + assert_eq!( + public_safe_status_filename("C:\\tmp\\audio.wav").as_deref(), + Some("audio.wav") + ); + } + + #[test] + fn job_status_normalizes_unknown_to_failed() { + assert_eq!( + normalize_job_status("TRANSCribing"), + JOB_STATUS_TRANSCRIBING + ); + assert_eq!(normalize_job_status("mystery"), JOB_STATUS_FAILED); + } +} diff --git a/crates/voscript_core/src/lib.rs b/crates/voscript_core/src/lib.rs index ad1bebe..0bb0be0 100644 --- a/crates/voscript_core/src/lib.rs +++ b/crates/voscript_core/src/lib.rs @@ -5,6 +5,7 @@ use pyo3::prelude::*; #[cfg(feature = "python-bindings")] use pyo3::types::{PyDict, PyList, PyModule}; +pub mod contracts; pub mod postprocess; pub mod voiceprint; @@ -60,6 +61,14 @@ fn optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult, key: &str, default: bool) -> PyResult { + match dict.get_item(key)? { + Some(value) if !value.is_none() => value.extract::(), + _ => Ok(default), + } +} + #[cfg(feature = "python-bindings")] fn item_f64_or_default(item: Option>, default: f64) -> PyResult { match item { @@ -79,6 +88,90 @@ fn optional_f64_or_default(dict: &Bound<'_, PyDict>, key: &str, default: f64) -> item_f64_or_default(dict.get_item(key)?, default) } +#[cfg(feature = "python-bindings")] +fn parse_manifest_entry( + item: Bound<'_, PyAny>, +) -> PyResult> { + let dict = match item.cast_into::() { + Ok(dict) => dict, + Err(_) => return Ok(None), + }; + let name = match optional_string(&dict, "name")? { + Some(value) => value, + None => return Ok(None), + }; + let filename = match optional_string(&dict, "filename")? { + Some(value) => value, + None => return Ok(None), + }; + let role = match optional_string(&dict, "role")? { + Some(value) => value, + None => return Ok(None), + }; + let media_type = match optional_string(&dict, "media_type")? { + Some(value) => value, + None => return Ok(None), + }; + let entry = contracts::ArtifactManifestEntry { + name: match contracts::public_safe_text(&name, "name") { + Ok(value) => value, + Err(_) => return Ok(None), + }, + filename: match contracts::public_safe_manifest_filename(&filename) { + Ok(value) => value, + Err(_) => return Ok(None), + }, + role: match contracts::public_safe_text(&role, "role") { + Ok(value) => value, + Err(_) => return Ok(None), + }, + media_type: match contracts::public_safe_text(&media_type, "media_type") { + Ok(value) => value, + Err(_) => return Ok(None), + }, + required_for_result: optional_bool(&dict, "required_for_result", false)?, + speaker_label: match optional_string(&dict, "speaker_label")? { + Some(value) => contracts::public_safe_text(&value, "speaker_label").ok(), + None => None, + }, + }; + Ok(Some(entry)) +} + +#[cfg(feature = "python-bindings")] +fn append_manifest_entries( + py: Python<'_>, + target: &Bound<'_, PyList>, + payload: &Bound<'_, PyDict>, + category: &str, +) -> PyResult<()> { + let Some(raw_entries) = payload.get_item(category)? else { + return Ok(()); + }; + if raw_entries.is_none() { + return Ok(()); + } + let Ok(entries) = raw_entries.cast_into::() else { + return Ok(()); + }; + for item in entries.iter() { + let Some(entry) = parse_manifest_entry(item)? else { + continue; + }; + let entry_dict = PyDict::new(py); + entry_dict.set_item("name", entry.name)?; + entry_dict.set_item("filename", entry.filename)?; + entry_dict.set_item("role", entry.role)?; + entry_dict.set_item("media_type", entry.media_type)?; + entry_dict.set_item("required_for_result", entry.required_for_result)?; + if let Some(speaker_label) = entry.speaker_label { + entry_dict.set_item("speaker_label", speaker_label)?; + } + target.append(entry_dict)?; + } + Ok(()) +} + #[cfg(feature = "python-bindings")] fn parse_voiceprint_candidate( item: Bound<'_, PyAny>, @@ -225,6 +318,50 @@ fn parse_postprocess_request( Ok((aligned_segments, speaker_map)) } +#[cfg(feature = "python-bindings")] +#[pyfunction] +fn artifact_manifest_contract(py: Python<'_>, payload: &Bound<'_, PyDict>) -> PyResult> { + let response = PyDict::new(py); + let manifest_version = match optional_string(payload, "manifest_version")? { + Some(value) if !value.trim().is_empty() => { + contracts::public_safe_text(&value, "manifest_version") + .map_err(PyValueError::new_err)? + } + _ => contracts::ARTIFACT_MANIFEST_VERSION.to_string(), + }; + response.set_item("manifest_version", manifest_version)?; + + for category in contracts::ARTIFACT_MANIFEST_CATEGORIES { + let entries = PyList::empty(py); + append_manifest_entries(py, &entries, payload, category)?; + response.set_item(category, entries)?; + } + Ok(response.unbind()) +} + +#[cfg(feature = "python-bindings")] +#[pyfunction] +fn status_payload_contract(py: Python<'_>, payload: &Bound<'_, PyDict>) -> PyResult> { + let status = optional_string(payload, "status")?.unwrap_or_default(); + let updated_at = optional_string(payload, "updated_at")? + .filter(|value| !value.trim().is_empty()) + .unwrap_or_else(|| "1970-01-01T00:00:00+00:00".to_string()); + + let response = PyDict::new(py); + response.set_item("status", contracts::normalize_job_status(&status))?; + response.set_item("updated_at", updated_at)?; + match optional_string(payload, "error")? { + Some(error) => response.set_item("error", error)?, + None => response.set_item("error", py.None())?, + } + if let Some(filename) = optional_string(payload, "filename")? + .and_then(|value| contracts::public_safe_status_filename(&value)) + { + response.set_item("filename", filename)?; + } + Ok(response.unbind()) +} + #[cfg(feature = "python-bindings")] #[pyfunction] fn voiceprint_score(py: Python<'_>, payload: &Bound<'_, PyDict>) -> PyResult> { @@ -298,7 +435,9 @@ fn postprocess_segments(py: Python<'_>, payload: &Bound<'_, PyDict>) -> PyResult #[pymodule] fn voscript_core(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add("__version__", PACKAGE_VERSION)?; + module.add_function(wrap_pyfunction!(artifact_manifest_contract, module)?)?; module.add_function(wrap_pyfunction!(core_smoke, module)?)?; + module.add_function(wrap_pyfunction!(status_payload_contract, module)?)?; module.add_function(wrap_pyfunction!(voiceprint_score, module)?)?; module.add_function(wrap_pyfunction!(postprocess_segments, module)?)?; Ok(()) @@ -308,7 +447,7 @@ fn voscript_core(module: &Bound<'_, PyModule>) -> PyResult<()> { mod tests { #[test] fn package_version_is_set() { - assert_eq!(super::PACKAGE_VERSION, "0.8.2"); + assert_eq!(super::PACKAGE_VERSION, "0.8.3"); } #[test] diff --git a/doc/changelog.en.md b/doc/changelog.en.md index ddb57de..2561cef 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -22,6 +22,14 @@ `RUST_KERNEL_MODE=required` runs. The default remains Python post-processing; result segments keep stable `speaker_label` values, duplicate display names are disambiguated instead of merged, and `segments[].words` remains optional. +- Added artifact/status/schema helper contracts. Artifact manifests are + normalized through public-safe stable / optional / experimental categories, + persisted status payloads share one contract helper, and schema versions stay + optional-first for legacy `result.json` / `status.json` compatibility. +- Added optional Rust-backed artifact manifest helper validation for explicit + `RUST_KERNEL_MODE=required` runs. The default remains Python-owned contract + assembly; selected Rust helper validation must import and run successfully or + fail closed. ### Security @@ -41,6 +49,9 @@ - Extended Rust kernel and Docker smoke coverage to include result post-processing segment assembly, display-name disambiguation, and word normalization. +- Extended contract tests for legacy / unknown artifact manifests, persisted + status payloads, optional schema versions, and Rust artifact helper bridge + validation. ## 0.7.6 — Health, alignment, and embedding runtime fixes (2026-05-07) diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index 9396422..d4ecf7e 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -18,6 +18,13 @@ - 新增显式 `RUST_KERNEL_MODE=required` 下可选的 Rust-backed 结果后处理。 默认仍使用 Python 后处理;结果 segment 继续保留稳定 `speaker_label`, 重名展示名只做序号消歧而不合并 speaker,`segments[].words` 仍是可选字段。 +- 新增 artifact/status/schema helper contract。Artifact manifest 会通过 + stable / optional / experimental 三类 public-safe 结构归一化;持久化 + status payload 统一由 contract helper 构建;schema version 继续保持 + optional-first,以兼容旧 `result.json` / `status.json`。 +- 新增显式 `RUST_KERNEL_MODE=required` 下可选的 Rust-backed artifact + manifest helper 校验。默认仍由 Python 组装 contract;被选择的 Rust + helper 校验必须成功导入并执行,否则 fail closed。 ### 安全 @@ -33,6 +40,8 @@ top-2 margin 模糊拒绝以及非有限 embedding 拒绝等声纹计分 golden case。 - 扩展 Rust kernel 与 Docker smoke 覆盖,加入结果后处理 segment 组装、展示名 消歧和 word normalization。 +- 扩展 contract 测试,覆盖旧版 / 未知 artifact manifest、持久化 status + payload、可选 schema version,以及 Rust artifact helper bridge 校验。 ## 0.7.6 — 健康检查、alignment 与 embedding 运行时修复 (2026-05-07) diff --git a/doc/configuration.en.md b/doc/configuration.en.md index 44a72ad..ba1bb17 100644 --- a/doc/configuration.en.md +++ b/doc/configuration.en.md @@ -38,7 +38,7 @@ parameters yet. | `FFMPEG_TIMEOUT_SEC` | `1800` | ffmpeg conversion timeout in seconds; timeout returns `504`. | | `JOBS_MAX_CACHE` | `200` | In-memory job LRU limit. Evicted completed jobs remain queryable from disk `status.json` / `result.json`. | | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU model idle-unload timeout, defaulting to 180 seconds (3 minutes). Set `0` to disable idle unload and keep models resident. When enabled, loaded models are released only after the serialized GPU runtime has been idle for this many seconds; on the next reload, ASR, diarization, and embedding each choose the visible CUDA device with the most free memory during their own lazy load. | -| `RUST_KERNEL_MODE` | `off` | Optional Rust-backed provider/kernel mode. `off` keeps Python implementations; `required` makes selected Rust-backed paths import and run successfully or fail closed. The current selected paths are voiceprint scoring and result post-processing; CI / Docker packaging still validates the Rust extension directly when the runtime default is off. | +| `RUST_KERNEL_MODE` | `off` | Optional Rust-backed provider/kernel mode. `off` keeps Python implementations; `required` makes selected Rust-backed paths import and run successfully or fail closed. The current selected paths are voiceprint scoring, result post-processing, and artifact manifest helper contracts; CI / Docker packaging still validates the Rust extension directly when the runtime default is off. | `MODELS_DIR` and `LANGUAGE` are defined in the config module, but v0.7.6's main HTTP transcription path does not use them as stable public tuning knobs: diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md index 4a89668..f955c98 100644 --- a/doc/configuration.zh.md +++ b/doc/configuration.zh.md @@ -36,7 +36,7 @@ | `FFMPEG_TIMEOUT_SEC` | `1800` | ffmpeg 转码超时秒数,超时返回 `504`。 | | `JOBS_MAX_CACHE` | `200` | 内存 job LRU 上限;被淘汰的完成任务仍可从磁盘 `status.json` / `result.json` 查询。 | | `MODEL_IDLE_TIMEOUT_SEC` | `180` | GPU 模型空闲卸载超时,默认 180 秒(3 分钟)。设为 `0` 可关闭空闲卸载并保持模型常驻。开启后,只有串行 GPU 运行时空闲达到该秒数才释放已加载模型;下一次 reload 时 ASR、diarization 和 embedding 会在各自 lazy load 时分别选择当前可见 CUDA 中空闲显存最多的设备。 | -| `RUST_KERNEL_MODE` | `off` | 可选 Rust-backed provider/kernel 路径开关。`off` 保持 Python 实现;`required` 要求被选择的 Rust-backed 路径可导入并执行,缺失或调用失败时 fail closed。当前被选择的路径是声纹计分和结果后处理;默认关闭时,CI / Docker packaging 仍会直接验证 Rust 扩展。 | +| `RUST_KERNEL_MODE` | `off` | 可选 Rust-backed provider/kernel 路径开关。`off` 保持 Python 实现;`required` 要求被选择的 Rust-backed 路径可导入并执行,缺失或调用失败时 fail closed。当前被选择的路径是声纹计分、结果后处理和 artifact manifest helper contract;默认关闭时,CI / Docker packaging 仍会直接验证 Rust 扩展。 | `MODELS_DIR` 和 `LANGUAGE` 在配置模块里有定义,但 v0.7.6 的主 HTTP 转写路径 没有把它们作为稳定公开调参入口使用:Whisper 本地 checkpoint 查找仍使用 diff --git a/tests/unit/test_artifact_status_schema_contracts.py b/tests/unit/test_artifact_status_schema_contracts.py new file mode 100644 index 0000000..37be49d --- /dev/null +++ b/tests/unit/test_artifact_status_schema_contracts.py @@ -0,0 +1,162 @@ +"""Contract tests for artifact manifest, status, and schema helpers.""" + +from __future__ import annotations + +import pytest + +from pipeline.contracts import ( + ARTIFACT_MANIFEST_VERSION, + ArtifactManifestEntry, + attach_optional_schema_version, + build_artifact_manifest, + build_status_payload, + empty_artifact_manifest, + normalize_artifact_manifest, + normalize_status_payload, + read_optional_schema_version, +) + + +def test_artifact_manifest_builds_public_safe_known_categories_only(): + manifest = build_artifact_manifest( + stable=[ + ArtifactManifestEntry( + name="result", + filename="result.json", + role="primary_result", + media_type="application/json", + required_for_result=True, + ) + ], + optional=[ + ArtifactManifestEntry( + name="speaker_embedding", + filename="emb_SPEAKER_00.npy", + role="speaker_embedding", + media_type="application/octet-stream", + speaker_label="SPEAKER_00", + ) + ], + ) + + assert manifest == { + "manifest_version": ARTIFACT_MANIFEST_VERSION, + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ], + "optional": [ + { + "name": "speaker_embedding", + "filename": "emb_SPEAKER_00.npy", + "role": "speaker_embedding", + "media_type": "application/octet-stream", + "required_for_result": False, + "speaker_label": "SPEAKER_00", + } + ], + "experimental": [], + } + + +def test_artifact_manifest_rejects_generated_local_paths(): + with pytest.raises(ValueError, match="filename must not include"): + build_artifact_manifest( + stable=[ + ArtifactManifestEntry( + name="result", + filename="private/result.json", + role="primary_result", + media_type="application/json", + ) + ] + ) + + +def test_normalize_artifact_manifest_tolerates_legacy_unknown_and_bad_entries(): + assert normalize_artifact_manifest(None) == empty_artifact_manifest() + + manifest = normalize_artifact_manifest( + { + "manifest_version": "future.v2", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + "local_path": "private/result.json", + "speaker_id": "example-speaker-id", + }, + {"name": "bad", "filename": "../secret", "role": "x"}, + "unknown-entry", + ], + "future_category": [{"path": "private/result.json"}], + } + ) + + assert manifest["manifest_version"] == "future.v2" + assert manifest["stable"] == [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ] + assert manifest["optional"] == [] + assert manifest["experimental"] == [] + assert "local_path" not in manifest["stable"][0] + assert "speaker_id" not in manifest["stable"][0] + + +def test_status_payload_build_and_legacy_normalization_keep_shape(): + payload = build_status_payload( + "queued", + filename="private/audio.wav", + updated_at="2026-06-09T00:00:00+00:00", + ) + + assert payload == { + "status": "queued", + "updated_at": "2026-06-09T00:00:00+00:00", + "error": None, + "filename": "audio.wav", + } + + legacy = normalize_status_payload( + { + "status": "transcribing", + "updated_at": "2026-06-09T00:00:00+00:00", + "filename": "C:\\private\\legacy.wav", + "internal_path": "private/legacy.wav", + } + ) + assert legacy == { + "status": "transcribing", + "updated_at": "2026-06-09T00:00:00+00:00", + "error": None, + "filename": "legacy.wav", + } + assert normalize_status_payload({"status": "mystery"})["status"] == "failed" + + +def test_schema_version_is_optional_first_for_legacy_artifacts(): + assert read_optional_schema_version({"id": "tr_legacy"}) is None + assert read_optional_schema_version({"schema_version": "result.v1"}) == "result.v1" + assert attach_optional_schema_version({"id": "tr_legacy"}, None) == { + "id": "tr_legacy" + } + assert attach_optional_schema_version({"id": "tr_new"}, "result.v1") == { + "id": "tr_new", + "schema_version": "result.v1", + } + with pytest.raises(ValueError, match="schema_version"): + read_optional_schema_version({"schema_version": "../private"}) diff --git a/tests/unit/test_kernel_bridge.py b/tests/unit/test_kernel_bridge.py index c8037b9..be7e0b2 100644 --- a/tests/unit/test_kernel_bridge.py +++ b/tests/unit/test_kernel_bridge.py @@ -8,11 +8,13 @@ from providers.kernel_bridge import ( RustKernelBridgeError, + artifact_manifest_contract, core_smoke, postprocess_segments, require_rust_core, rust_kernel_mode, rust_provider_paths_enabled, + status_payload_contract, ) @@ -23,7 +25,7 @@ def _core_smoke(payload): return { "ok": True, "echoed": payload, - "version": "0.8.2", + "version": "0.8.3", "capabilities": {"core_smoke": True, "rust_extension": True}, } @@ -37,7 +39,7 @@ def test_core_smoke_round_trips_safe_payload_through_imported_extension(): assert result["ok"] is True assert result["echoed"] == payload - assert result["version"] == "0.8.2" + assert result["version"] == "0.8.3" assert result["capabilities"]["core_smoke"] is True @@ -137,3 +139,97 @@ def _importer(module_name): {"aligned_segments": [], "speaker_map": {}}, importer=_importer, ) + + +def test_artifact_manifest_contract_round_trips_valid_kernel_response(): + def _importer(module_name): + assert module_name == "voscript_core" + + def _artifact_manifest_contract(payload): + assert payload["stable"][0]["filename"] == "result.json" + return { + "manifest_version": "artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ], + "optional": [], + "experimental": [], + } + + return SimpleNamespace(artifact_manifest_contract=_artifact_manifest_contract) + + result = artifact_manifest_contract( + { + "manifest_version": "artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ], + "optional": [], + "experimental": [], + }, + importer=_importer, + ) + + assert result["stable"][0]["required_for_result"] is True + + +def test_artifact_manifest_contract_rejects_path_leak_response(): + def _importer(module_name): + assert module_name == "voscript_core" + return SimpleNamespace( + artifact_manifest_contract=lambda payload: { + "manifest_version": "artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "private/result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ], + "optional": [], + "experimental": [], + } + ) + + with pytest.raises(RustKernelBridgeError, match="filename must not expose"): + artifact_manifest_contract({"stable": []}, importer=_importer) + + +def test_status_payload_contract_round_trips_valid_kernel_response(): + def _importer(module_name): + assert module_name == "voscript_core" + return SimpleNamespace( + status_payload_contract=lambda payload: { + "status": "queued", + "updated_at": "2026-06-09T00:00:00+00:00", + "error": None, + "filename": "audio.wav", + } + ) + + result = status_payload_contract( + { + "status": "queued", + "updated_at": "2026-06-09T00:00:00+00:00", + "filename": "audio.wav", + }, + importer=_importer, + ) + + assert result["status"] == "queued" + assert result["error"] is None + assert result["filename"] == "audio.wav" diff --git a/tests/unit/test_pipeline_runner.py b/tests/unit/test_pipeline_runner.py index c809d58..2b47732 100644 --- a/tests/unit/test_pipeline_runner.py +++ b/tests/unit/test_pipeline_runner.py @@ -633,6 +633,27 @@ def fake_postprocess_segments(payload): assert unique_speakers == ["SPEAKER_00"] +def test_artifact_manifest_uses_rust_contract_when_required(monkeypatch): + calls = [] + + def fake_artifact_manifest_contract(payload): + calls.append(payload) + return payload + + monkeypatch.setattr(artifacts_default, "rust_provider_paths_enabled", lambda: True) + monkeypatch.setattr( + artifacts_default, + "artifact_manifest_contract", + fake_artifact_manifest_contract, + ) + + manifest = InMemoryArtifactsProvider._build_artifact_manifest(["SPEAKER_00"]) + + assert calls == [manifest] + assert manifest["stable"][0]["filename"] == "result.json" + assert manifest["stable"][1]["filename"] == "emb_SPEAKER_00.npy" + + def test_artifact_result_contract_keeps_status_speaker_label_and_optional_alignment( tmp_path, ):