Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 236 additions & 0 deletions .github/scripts/extract_spec_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
"""Diff two OpenAPI specs; classify each change as breaking or non-breaking.

Emits a JSON list of changes. Operations are keyed by operationId (falling back
to "<METHOD> <path>"); schemas by name (properties as "Schema.prop"). Used by the
spec-drift-detector workflow in both PR mode (git refs) and scheduled mode (files).

Zero runtime dependencies beyond pyyaml.
"""
import argparse
import json
import subprocess
from pathlib import Path

import yaml

# HTTP verbs that can appear as operation keys under an OpenAPI Path Item.
# "trace" is included because the Path Item Object defines it alongside the
# other seven; any other Path Item key (parameters, summary, servers, ...) is
# not an operation.
METHODS = {"get", "put", "post", "delete", "options", "head", "patch", "trace"}


def load(text: str) -> dict:
    """Parse YAML *text* into a spec mapping.

    An empty or otherwise falsy document (such as the '' used for a file that
    is absent on one side of the diff) becomes ``{}``.

    Raises:
        ValueError: if the top-level YAML node is not a mapping. Without this
            check a list or scalar document would be returned as-is and fail
            later with an opaque AttributeError on ``.get``.
    """
    parsed = yaml.safe_load(text) or {}
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a mapping at the top level of the spec, got {type(parsed).__name__}"
        )
    return parsed


def git_show(ref: str, path: str) -> str:
    """Return the text of *path* as stored at git *ref*.

    Runs ``git show <ref>:<path>``. When git exits non-zero (for example the
    file is new or deleted on that side) the result is '' rather than an error.
    """
    command = ["git", "show", f"{ref}:{path}"]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return ""
    return completed.stdout


def operations(spec: dict) -> dict:
    """Index every ``paths.<path>.<method>`` operation of *spec*.

    Returns a dict keyed by the operation's ``operationId`` (or
    ``"<METHOD> <path>"`` when it has none) whose values are
    ``{"method": <upper-cased verb>, "path": <path>, "op": <operation object>}``.
    Path items and operations that are not dicts, and keys that are not HTTP
    methods, are skipped.
    """
    found = {}
    path_items = spec.get("paths") or {}
    for route, item in path_items.items():
        if not isinstance(item, dict):
            continue
        for verb, body in item.items():
            if verb.lower() in METHODS and isinstance(body, dict):
                op_key = body.get("operationId") or f"{verb.upper()} {route}"
                found[op_key] = {"method": verb.upper(), "path": route, "op": body}
    return found


def params(op: dict) -> dict:
    """Map ``(name, in)`` to the parameter object for each parameter of *op*.

    Entries that are not dicts or that have no ``name`` are ignored; a missing
    ``in`` is recorded as ''.
    """
    declared = op.get("parameters") or []
    return {
        (item["name"], item.get("in", "")): item
        for item in declared
        if isinstance(item, dict) and "name" in item
    }


def schemas(spec: dict) -> dict:
    """Return ``components.schemas`` of *spec*, or ``{}`` when missing or empty."""
    components = spec.get("components") or {}
    found = components.get("schemas")
    return found if found else {}


def _type(d):
if not isinstance(d, dict):
return None
s = d.get("schema", d)
if not isinstance(s, dict):
return None
return s.get("type") or s.get("$ref")


def _fmt(d):
"""The `format` qualifier (e.g. int32, uuid) of a parameter/property schema."""
if not isinstance(d, dict):
return None
s = d.get("schema", d)
if not isinstance(s, dict):
return None
return s.get("format")


def diff_operations(base: dict, head: dict) -> list[dict]:
    """List operation-level changes between two specs.

    Args:
        base: Parsed base spec.
        head: Parsed head spec.

    Returns:
        Change dicts in this order: removed operations (breaking), added
        operations (non-breaking), then the per-operation changes of every
        operation present on both sides. Each group is sorted by operation
        key so the output is identical across runs; iterating the raw key
        sets would follow hash order, which varies between interpreter runs.
    """
    changes = []
    bo, ho = operations(base), operations(head)
    for k in sorted(bo.keys() - ho.keys(), key=str):
        changes.append({"kind": "operation", "id": k, "change": "removed", "breaking": True,
                        "before": f"{bo[k]['method']} {bo[k]['path']}", "after": "",
                        "detail": "operation removed"})
    for k in sorted(ho.keys() - bo.keys(), key=str):
        changes.append({"kind": "operation", "id": k, "change": "added", "breaking": False,
                        "before": "", "after": f"{ho[k]['method']} {ho[k]['path']}",
                        "detail": "operation added"})
    for k in sorted(bo.keys() & ho.keys(), key=str):
        changes += diff_one_op(k, bo[k]["op"], ho[k]["op"])
    return changes


def diff_one_op(key: str, b: dict, h: dict) -> list[dict]:
    """Compare one operation present in both specs and list its changes.

    Args:
        key: Operation key, used as the ``id`` of every emitted change.
        b: Operation object from the base spec.
        h: Operation object from the head spec.

    Returns:
        Change dicts (``kind`` "operation", ``change`` "modified") in this
        order: parameter relocations, removed parameters, added parameters,
        required/type/format changes of parameters present on both sides, and
        removed response codes. Each group is sorted so repeated runs on the
        same specs give identical output.
    """

    def change(breaking, before, after, detail):
        # Single place that fixes the field set and field order of a change.
        return {"kind": "operation", "id": key, "change": "modified", "breaking": breaking,
                "before": before, "after": after, "detail": detail}

    def order(param_key):
        # Parameter keys are (name, location); stringify both parts so an
        # unusual YAML scalar (e.g. a numeric name) cannot break the sort.
        return str(param_key[0]), str(param_key[1])

    changes = []
    bp, hp = params(b), params(h)

    # Parameter location changes (e.g. query -> path): a pure relocation otherwise
    # shows up as a misleading remove + add. Detect by name and report once, breaking.
    b_loc, h_loc = {}, {}
    for nm, loc in bp:
        b_loc.setdefault(nm, set()).add(loc)
    for nm, loc in hp:
        h_loc.setdefault(nm, set()).add(loc)
    relocated = set()
    for nm in sorted(b_loc.keys() & h_loc.keys(), key=str):
        # Only an unambiguous one-location -> one-location move counts.
        if b_loc[nm] != h_loc[nm] and len(b_loc[nm]) == 1 and len(h_loc[nm]) == 1:
            relocated.add(nm)
            bl, hl = next(iter(b_loc[nm])), next(iter(h_loc[nm]))
            changes.append(change(True, f"{nm} in {bl}", f"{nm} in {hl}",
                                  f"parameter '{nm}' moved from {bl} to {hl}"))

    # Removing or adding a parameter is breaking only when it is required.
    for name in sorted(bp.keys() - hp.keys(), key=order):
        if name[0] in relocated:
            continue
        req = bool(bp[name].get("required"))
        changes.append(change(req, f"param {name[0]}", "",
                              f"{'required ' if req else ''}parameter '{name[0]}' removed"))
    for name in sorted(hp.keys() - bp.keys(), key=order):
        if name[0] in relocated:
            continue
        req = bool(hp[name].get("required"))
        changes.append(change(req, "", f"param {name[0]}",
                              f"{'required ' if req else ''}parameter '{name[0]}' added"))

    for name in sorted(bp.keys() & hp.keys(), key=order):
        pb, ph = bp[name], hp[name]
        if not pb.get("required") and ph.get("required"):
            changes.append(change(True, f"{name[0]} optional", f"{name[0]} required",
                                  f"parameter '{name[0]}' now required"))
        if _type(pb) != _type(ph):
            changes.append(change(True, str(_type(pb)), str(_type(ph)),
                                  f"parameter '{name[0]}' type changed"))
        if _fmt(pb) != _fmt(ph):
            changes.append(change(True, str(_fmt(pb)), str(_fmt(ph)),
                                  f"parameter '{name[0]}' format changed"))

    # Removed response codes: consumers may branch on them, so removal is breaking.
    # Added codes are deliberately not reported.
    br = {str(c) for c in (b.get("responses") or {})}
    hr = {str(c) for c in (h.get("responses") or {})}
    for code in sorted(br - hr):
        changes.append(change(True, f"response {code}", "", f"response '{code}' removed"))
    return changes


def diff_schemas(base: dict, head: dict) -> list[dict]:
    """List changes to ``components.schemas`` between two specs.

    Args:
        base: Parsed base spec.
        head: Parsed head spec.

    Returns:
        Change dicts in this order: removed schemas (breaking), added schemas
        (non-breaking), then the property-level changes of every schema
        present on both sides. Each group is sorted by schema name so the
        output does not depend on set iteration order.
    """
    changes = []
    bs, hs = schemas(base), schemas(head)
    for name in sorted(bs.keys() - hs.keys(), key=str):
        changes.append({"kind": "schema", "id": name, "change": "removed", "breaking": True,
                        "before": name, "after": "", "detail": "schema removed"})
    for name in sorted(hs.keys() - bs.keys(), key=str):
        changes.append({"kind": "schema", "id": name, "change": "added", "breaking": False,
                        "before": "", "after": name, "detail": "schema added"})
    for name in sorted(bs.keys() & hs.keys(), key=str):
        changes += diff_one_schema(name, bs[name], hs[name])
    return changes


def diff_one_schema(name: str, b: dict, h: dict) -> list[dict]:
    """Compare one schema present in both specs, property by property.

    Args:
        name: Schema name; change ids are ``"<name>.<property>"``.
        b: Schema object from the base spec.
        h: Schema object from the head spec.

    Returns:
        Change dicts (``kind`` "schema") in this order: removed properties
        (breaking), added properties (breaking only when listed in the head
        schema's ``required``), then for properties on both sides any
        optional->required, type, format and removed-enum-value changes (all
        breaking). Properties are visited in sorted order and enum values in
        base-spec order, so the output is stable across runs.
    """

    def change(prop, what, breaking, before, after, detail):
        # Single place that fixes the field set and field order of a change.
        return {"kind": "schema", "id": f"{name}.{prop}", "change": what, "breaking": breaking,
                "before": before, "after": after, "detail": detail}

    changes = []
    bp = b.get("properties") or {}
    hp = h.get("properties") or {}
    breq, hreq = set(b.get("required") or []), set(h.get("required") or [])

    for prop in sorted(bp.keys() - hp.keys(), key=str):
        changes.append(change(prop, "removed", True, prop, "", f"property '{prop}' removed"))
    for prop in sorted(hp.keys() - bp.keys(), key=str):
        newreq = prop in hreq
        changes.append(change(prop, "added", newreq, "", prop,
                              f"{'required ' if newreq else ''}property '{prop}' added"))

    for prop in sorted(bp.keys() & hp.keys(), key=str):
        if prop not in breq and prop in hreq:
            changes.append(change(prop, "modified", True, "optional", "required",
                                  f"property '{prop}' now required"))
        if _type(bp[prop]) != _type(hp[prop]):
            changes.append(change(prop, "modified", True,
                                  str(_type(bp[prop])), str(_type(hp[prop])),
                                  f"property '{prop}' type changed"))
        if _fmt(bp[prop]) != _fmt(hp[prop]):
            changes.append(change(prop, "modified", True,
                                  str(_fmt(bp[prop])), str(_fmt(hp[prop])),
                                  f"property '{prop}' format changed"))

        # Enum values are compared with == on the lists themselves rather than
        # through set(): that keeps base-spec order and also works when an enum
        # member is unhashable (a list or mapping).
        b_enum = (bp[prop].get("enum") or []) if isinstance(bp[prop], dict) else []
        h_enum = (hp[prop].get("enum") or []) if isinstance(hp[prop], dict) else []
        reported = []
        for value in b_enum:
            if value in h_enum or value in reported:
                continue  # still allowed, or a duplicate already reported
            reported.append(value)
            changes.append(change(prop, "modified", True, str(value), "",
                                  f"enum value '{value}' removed from '{prop}'"))
    return changes


def diff(base: dict, head: dict) -> list[dict]:
    """Return every change between two specs: operation changes, then schema changes."""
    changes = diff_operations(base, head)
    changes.extend(diff_schemas(base, head))
    return changes


def main():
    """CLI entry point: diff two specs and write the change list as JSON.

    Two input modes:
      * git-ref mode (``--base-ref`` + ``--head-ref`` + ``--path``): both sides
        are read with ``git show <ref>:<path>``.
      * file mode (``--base`` / ``--head``): each side is read from disk; a
        side whose path is not given or does not exist is treated as an empty
        spec.

    Every change is stamped with ``--service`` and ``--version`` before being
    written to ``--output``; a one-line summary is printed to stdout.
    """
    ap = argparse.ArgumentParser(description="Diff two OpenAPI specs.")
    ap.add_argument("--base", help="Base spec file path")
    ap.add_argument("--head", help="Head spec file path")
    ap.add_argument("--base-ref", help="Base git ref (use with --path)")
    ap.add_argument("--head-ref", help="Head git ref (use with --path)")
    ap.add_argument("--path", help="Repo-relative spec path for git-ref mode")
    ap.add_argument("--service", default="")
    ap.add_argument("--version", default="")
    ap.add_argument("--output", default="spec-diff.json")
    a = ap.parse_args()

    if a.base_ref is not None and a.path:
        if a.head_ref is None:
            # Without this guard git would be asked for "None:<path>", fail,
            # and the head side would silently become an empty spec, reporting
            # every operation and schema as removed.
            ap.error("--head-ref is required when --base-ref and --path are given")
        base = load(git_show(a.base_ref, a.path))
        head = load(git_show(a.head_ref, a.path))
    else:
        # Explicit UTF-8 so parsing does not depend on the runner's locale.
        base = load(Path(a.base).read_text(encoding="utf-8")) if a.base and Path(a.base).exists() else {}
        head = load(Path(a.head).read_text(encoding="utf-8")) if a.head and Path(a.head).exists() else {}

    changes = diff(base, head)
    for c in changes:
        c["service"] = a.service
        c["version"] = a.version
    Path(a.output).write_text(json.dumps(changes, indent=2), encoding="utf-8")
    nb = sum(1 for c in changes if c["breaking"])
    print(f"{len(changes)} changes ({nb} breaking) -> {a.output}")


# Run the CLI only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()
107 changes: 107 additions & 0 deletions .github/scripts/tests/test_extract_spec_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Unit tests for extract_spec_diff. Run: pytest .github/scripts/tests -v
(no PYTHONPATH setup needed: this module prepends its parent directory,
.github/scripts, to sys.path before importing)."""
import os
import subprocess
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from extract_spec_diff import diff_operations, diff_one_op, diff_schemas # noqa: E402


def test_operation_added_and_removed():
    """A dropped operation is breaking; a newly introduced one is not."""
    before = {"paths": {"/indexes": {
        "get": {"operationId": "list_indexes"},
        "post": {"operationId": "create_index"},
    }}}
    after = {"paths": {
        "/indexes": {"get": {"operationId": "list_indexes"}},
        "/indexes/{name}": {"delete": {"operationId": "delete_index"}},
    }}
    indexed = {}
    for entry in diff_operations(before, after):
        indexed[(entry["id"], entry["change"])] = entry
    assert indexed[("create_index", "removed")]["breaking"] is True
    assert indexed[("delete_index", "added")]["breaking"] is False


def test_parameter_breaking_rules():
    """A parameter type change and a new required parameter are both breaking."""
    old_op = {"parameters": [{"name": "limit", "in": "query", "schema": {"type": "integer"}}]}
    new_op = {"parameters": [
        {"name": "limit", "in": "query", "schema": {"type": "string"}},
        {"name": "namespace", "in": "query", "required": True, "schema": {"type": "string"}},
    ]}
    breaking_by_detail = {}
    for entry in diff_one_op("op", old_op, new_op):
        breaking_by_detail[entry["detail"]] = entry["breaking"]
    for expected in ("parameter 'limit' type changed", "required parameter 'namespace' added"):
        assert breaking_by_detail[expected] is True


def test_optional_param_added_is_not_breaking():
    """Adding a parameter that is not required must be reported as non-breaking."""
    old_op = {"parameters": []}
    new_op = {"parameters": [{"name": "filter", "in": "query", "schema": {"type": "string"}}]}
    results = diff_one_op("op", old_op, new_op)
    first = results[0]
    assert first["detail"] == "parameter 'filter' added"
    assert first["breaking"] is False


def test_param_format_change_is_breaking():
    """int32 -> int64 is a breaking format change and not a type change."""

    def op_with_format(fmt):
        return {"parameters": [
            {"name": "id", "in": "query", "schema": {"type": "integer", "format": fmt}},
        ]}

    found = diff_one_op("op", op_with_format("int32"), op_with_format("int64"))
    breaking_by_detail = {entry["detail"]: entry["breaking"] for entry in found}
    assert breaking_by_detail["parameter 'id' format changed"] is True
    # type unchanged, only format
    assert "parameter 'id' type changed" not in breaking_by_detail


def test_param_location_change_reported_once_as_breaking():
    """Moving a parameter from query to path yields one breaking 'moved' entry."""

    def op_with_location(where):
        return {"parameters": [
            {"name": "name", "in": where, "required": True, "schema": {"type": "string"}},
        ]}

    found = diff_one_op("op", op_with_location("query"), op_with_location("path"))
    breaking_by_detail = {entry["detail"]: entry["breaking"] for entry in found}
    assert breaking_by_detail["parameter 'name' moved from query to path"] is True
    # the relocation must NOT also surface as a remove + add
    leaked = [d for d in breaking_by_detail if "removed" in d or "added" in d]
    assert not leaked


def test_removed_response_code_is_breaking():
    """Dropping a documented response code is flagged as breaking."""
    old_op = {"responses": {"200": {}, "404": {}}}
    new_op = {"responses": {"200": {}}}
    breaking_by_detail = {}
    for entry in diff_one_op("op", old_op, new_op):
        breaking_by_detail[entry["detail"]] = entry["breaking"]
    assert breaking_by_detail["response '404' removed"] is True


def test_added_response_code_is_not_flagged():
    """A newly documented response code produces no response-related change."""
    old_op = {"responses": {"200": {}}}
    new_op = {"responses": {"200": {}, "429": {}}}
    response_details = [
        entry["detail"] for entry in diff_one_op("op", old_op, new_op)
        if "response" in entry["detail"]
    ]
    assert not response_details


def test_schema_property_format_change_is_breaking():
    """date -> date-time on a schema property is a breaking format change."""

    def spec_with_format(fmt):
        prop = {"type": "string", "format": fmt}
        return {"components": {"schemas": {"M": {"properties": {"ts": prop}}}}}

    found = diff_schemas(spec_with_format("date"), spec_with_format("date-time"))
    breaking_by_detail = {entry["detail"]: entry["breaking"] for entry in found}
    assert breaking_by_detail["property 'ts' format changed"] is True


def test_schema_breaking_rules():
    """Removed property, new required property, type change and dropped enum
    value are each reported as breaking."""
    old_index = {
        "properties": {
            "name": {"type": "string"},
            "metric": {"type": "string", "enum": ["cosine", "dotproduct"]},
        },
        "required": ["name"],
    }
    new_index = {
        "properties": {
            "metric": {"type": "integer", "enum": ["cosine"]},
            "host": {"type": "string"},
        },
        "required": ["host"],
    }
    base = {"components": {"schemas": {"Index": old_index}}}
    head = {"components": {"schemas": {"Index": new_index}}}
    breaking_by_detail = {}
    for entry in diff_schemas(base, head):
        breaking_by_detail[entry["detail"]] = entry["breaking"]
    assert breaking_by_detail["property 'name' removed"] is True
    assert breaking_by_detail["required property 'host' added"] is True
    assert breaking_by_detail["property 'metric' type changed"] is True
    assert breaking_by_detail["enum value 'dotproduct' removed from 'metric'"] is True


def test_cli_on_files(tmp_path):
    """End-to-end run of the script in file mode.

    The base spec declares one operation and the head spec none, so the output
    must contain a 'removed' change for it, stamped with the --service value.
    """
    import json
    base = tmp_path / "base.yaml"
    head = tmp_path / "head.yaml"
    out = tmp_path / "diff.json"
    # The YAML must be nested (paths -> /x -> get -> operationId). With flat
    # one-space indentation "get" and "operationId" parse as siblings of "/x"
    # and no operation is found.
    base.write_text("paths:\n  /x:\n    get:\n      operationId: getx\n")
    head.write_text("paths: {}\n")
    script = os.path.join(os.path.dirname(__file__), "..", "extract_spec_diff.py")
    r = subprocess.run([sys.executable, script, "--base", str(base), "--head", str(head),
                        "--service", "db_data", "--version", "2025-10", "--output", str(out)],
                       capture_output=True, text=True)
    assert r.returncode == 0, r.stderr
    changes = json.loads(out.read_text())
    assert any(c["id"] == "getx" and c["change"] == "removed" and c["service"] == "db_data" for c in changes)
39 changes: 39 additions & 0 deletions .github/spec-manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"_comment": "Maps pinecone-api operationIds (and schema names) to the docs.pinecone.io guide pages and SDKs they surface. Edit when operations/schemas are added or pages renamed. Changed surface absent here is written to spec-gaps-unmapped.json on each run — tend that file to keep this current.",
"docs_base_url": "https://docs.pinecone.io",
"operations": {
"create_index": {"docs": ["guides/index-data/create-an-index"], "sdks": ["python", "ts", "go", "java"]},
"create_index_for_model": {"docs": ["guides/index-data/create-an-index", "guides/get-started/quickstart"], "sdks": ["python", "ts", "go", "java"]},
"list_indexes": {"docs": ["guides/manage-data/manage-indexes"], "sdks": ["python", "ts", "go", "java"]},
"describe_index": {"docs": ["guides/manage-data/manage-indexes"], "sdks": ["python", "ts", "go", "java"]},
"configure_index": {"docs": ["guides/manage-data/manage-indexes"], "sdks": ["python", "ts", "go", "java"]},
"delete_index": {"docs": ["guides/manage-data/manage-indexes"], "sdks": ["python", "ts", "go", "java"]},
"upsertVectors": {"docs": ["guides/index-data/upsert-data"], "sdks": ["python", "ts", "go", "java"]},
"upsertRecordsNamespace": {"docs": ["guides/index-data/upsert-data", "guides/get-started/quickstart"], "sdks": ["python", "ts"]},
"searchRecordsNamespace": {"docs": ["guides/search/search-overview", "guides/get-started/quickstart"], "sdks": ["python", "ts"]},
"queryVectors": {"docs": ["guides/search/search-overview"], "sdks": ["python", "ts", "go", "java"]},
"fetchVectors": {"docs": ["guides/manage-data/fetch-data"], "sdks": ["python", "ts", "go", "java"]},
"updateVector": {"docs": ["guides/manage-data/update-data"], "sdks": ["python", "ts", "go", "java"]},
"deleteVectors": {"docs": ["guides/manage-data/delete-data"], "sdks": ["python", "ts", "go", "java"]},
"listVectors": {"docs": ["guides/manage-data/list-record-ids"], "sdks": ["python", "ts", "go", "java"]},
"describeIndexStats": {"docs": ["guides/manage-data/manage-indexes"], "sdks": ["python", "ts", "go", "java"]},
"startBulkImport": {"docs": ["guides/index-data/import-data"], "sdks": ["python", "ts"]},
"describeBulkImport": {"docs": ["guides/index-data/import-data"], "sdks": ["python", "ts"]},
"listBulkImports": {"docs": ["guides/index-data/import-data"], "sdks": ["python", "ts"]},
"cancelBulkImport": {"docs": ["guides/index-data/import-data"], "sdks": ["python", "ts"]},
"create_backup": {"docs": ["guides/manage-data/back-up-an-index"], "sdks": ["python", "ts", "go", "java"]},
"create_index_from_backup_operation": {"docs": ["guides/manage-data/restore-an-index"], "sdks": ["python", "ts", "go", "java"]},
"create_collection": {"docs": ["guides/manage-data/manage-collections"], "sdks": ["python", "ts", "go", "java"]},
"createNamespace": {"docs": ["guides/manage-data/manage-namespaces"], "sdks": ["python", "ts", "go", "java"]},
"deleteNamespace": {"docs": ["guides/manage-data/manage-namespaces"], "sdks": ["python", "ts", "go", "java"]},
"embed": {"docs": ["guides/inference/generate-embeddings"], "sdks": ["python", "ts", "go", "java"]},
"rerank": {"docs": ["guides/search/rerank-results"], "sdks": ["python", "ts", "go", "java"]},
"list_models": {"docs": ["guides/inference/understanding-inference"], "sdks": ["python", "ts"]},
"get_model": {"docs": ["guides/inference/understanding-inference"], "sdks": ["python", "ts"]}
},
"schemas": {
"IndexModel": {"docs": ["guides/index-data/create-an-index"], "sdks": ["python", "ts", "go", "java"]},
"CreateIndexForModelRequest": {"docs": ["guides/index-data/create-an-index"], "sdks": ["python", "ts"]},
"SearchRecordsRequest": {"docs": ["guides/search/search-overview"], "sdks": ["python", "ts"]}
}
}