diff --git a/config/config.exs b/config/config.exs
index 2fe7409..544976e 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -295,6 +295,15 @@ config :loopctl, :knowledge_moc_excluded_tags, ~w(synology-docs synology-netback
 config :loopctl, :knowledge_proposal_duplicate_threshold, 0.97
 config :loopctl, :knowledge_proposal_overlap_threshold, 0.88
 
+# Route-the-findings (#4): two published articles whose cosine similarity is at/above
+# this are "too similar to comfortably coexist" — flagged with a `:potential_conflict`
+# link for the consuming agent to resolve (merge a redundancy / reconcile a real
+# contradiction). The KB only flags; it never judges which it is.
+config :loopctl, :knowledge_conflict_threshold, 0.93
+# Max `:relates_to`→`:potential_conflict` promotions the nightly lint sweep does per
+# tenant per run (bounds the existing-corpus backfill; it cycles over nights).
+config :loopctl, :knowledge_lint_max_conflict_promotions, 500
+
 # DI: WebAuthn adapter — defaults to Wax (overridden in test env)
 config :loopctl, :webauthn_adapter, Loopctl.WebAuthn.Wax
 
diff --git a/lib/loopctl/knowledge/article_link.ex b/lib/loopctl/knowledge/article_link.ex
index 052b722..df612d7 100644
--- a/lib/loopctl/knowledge/article_link.ex
+++ b/lib/loopctl/knowledge/article_link.ex
@@ -13,7 +13,11 @@ defmodule Loopctl.Knowledge.ArticleLink do
   - `tenant_id` -- FK to tenants (set programmatically, never cast)
   - `source_article_id` -- FK to articles (the origin of the link)
   - `target_article_id` -- FK to articles (the destination of the link)
-  - `relationship_type` -- enum: relates_to, derived_from, contradicts, supersedes
+  - `relationship_type` -- enum: relates_to, derived_from, contradicts, supersedes,
+    potential_conflict (a MECHANICAL "too similar to comfortably coexist" flag —
+    NOT an assertion that the two disagree; the consuming agent judges whether it's a
+    redundancy to merge or a real contradiction. Distinct from `contradicts`, which
+    asserts a known disagreement.)
   - `metadata` -- extensible JSONB, defaults to `%{}`
   - `inserted_at` -- creation timestamp (no updated_at)
 
@@ -29,7 +33,13 @@ defmodule Loopctl.Knowledge.ArticleLink do
 
   @type t :: %__MODULE__{}
 
-  @relationship_type_values [:relates_to, :derived_from, :contradicts, :supersedes]
+  @relationship_type_values [
+    :relates_to,
+    :derived_from,
+    :contradicts,
+    :supersedes,
+    :potential_conflict
+  ]
 
   schema "article_links" do
     tenant_field()
diff --git a/lib/loopctl/workers/article_linking_worker.ex b/lib/loopctl/workers/article_linking_worker.ex
index 95220e1..c0e47a0 100644
--- a/lib/loopctl/workers/article_linking_worker.ex
+++ b/lib/loopctl/workers/article_linking_worker.ex
@@ -129,28 +129,47 @@ defmodule Loopctl.Workers.ArticleLinkingWorker do
     log_if_exceeds_limit(article, tenant_id, max_comparisons)
 
     candidates = find_similar_articles(article, tenant_id, threshold, max_comparisons)
-    existing_pairs = get_existing_link_pairs(article.id, tenant_id)
-
-    new_links =
-      candidates
-      |> Enum.reject(fn %{id: cid} ->
-        MapSet.member?(existing_pairs, {article.id, cid}) or
-          MapSet.member?(existing_pairs, {cid, article.id})
-      end)
-      |> Enum.map(fn %{id: target_id, similarity: score} ->
-        %{
-          source_article_id: article.id,
-          target_article_id: target_id,
-          relationship_type: :relates_to,
-          metadata: %{"auto_generated" => true, "similarity_score" => score}
-        }
+    conflict_threshold = conflict_threshold()
+
+    # A `relates_to` ambient link for everything >= the link threshold, PLUS a
+    # `:potential_conflict` flag (route-the-findings #4) for pairs >= the conflict
+    # threshold — too similar to comfortably coexist, for the consumer to resolve.
+    # Dedup is type-aware so the two link types don't crowd each other out.
+    relates =
+      build_links(article.id, candidates, tenant_id, :relates_to, fn _sim -> true end)
+
+    conflicts =
+      build_links(article.id, candidates, tenant_id, :potential_conflict, fn sim ->
+        sim >= conflict_threshold
       end)
 
-    created_count = create_links(new_links, tenant_id)
+    created_count = create_links(relates ++ conflicts, tenant_id)
     log_audit_event(article.id, tenant_id, created_count)
     :ok
   end
 
+  defp build_links(article_id, candidates, tenant_id, type, keep?) do
+    existing = get_existing_link_pairs(article_id, tenant_id, type)
+
+    candidates
+    |> Enum.filter(fn %{similarity: sim} -> keep?.(sim) end)
+    |> Enum.reject(fn %{id: cid} ->
+      MapSet.member?(existing, {article_id, cid}) or MapSet.member?(existing, {cid, article_id})
+    end)
+    |> Enum.map(fn %{id: target_id, similarity: score} ->
+      %{
+        source_article_id: article_id,
+        target_article_id: target_id,
+        relationship_type: type,
+        metadata: %{"auto_generated" => true, "similarity_score" => score}
+      }
+    end)
+  end
+
+  defp conflict_threshold do
+    Application.get_env(:loopctl, :knowledge_conflict_threshold, 0.93)
+  end
+
   # US-27.7a: route the similarity lookup through the shared, scale-tested kNN helper
   # (`Loopctl.Knowledge.VectorSearch.nearest/4`) on the dedicated HeavyRead pool instead
   # of a bespoke `AdminRepo` cosine query. The helper's index-correct shape guarantees a
@@ -224,9 +243,10 @@ defmodule Loopctl.Workers.ArticleLinkingWorker do
     where(query, [a], is_nil(a.project_id) or a.project_id == ^project_id)
   end
 
-  defp get_existing_link_pairs(article_id, tenant_id) do
+  defp get_existing_link_pairs(article_id, tenant_id, type) do
     from(l in ArticleLink,
       where: l.tenant_id == ^tenant_id,
+      where: l.relationship_type == ^type,
       where: l.source_article_id == ^article_id or l.target_article_id == ^article_id,
       select: {l.source_article_id, l.target_article_id}
     )
diff --git a/lib/loopctl/workers/knowledge_lint_worker.ex b/lib/loopctl/workers/knowledge_lint_worker.ex
index 73fff6c..4197356 100644
--- a/lib/loopctl/workers/knowledge_lint_worker.ex
+++ b/lib/loopctl/workers/knowledge_lint_worker.ex
@@ -58,6 +58,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
   alias Loopctl.Audit
   alias Loopctl.Knowledge
   alias Loopctl.Knowledge.Article
+  alias Loopctl.Knowledge.ArticleLink
   alias Loopctl.Tenants.Tenant
   alias Loopctl.Workers.ArticleEmbeddingWorker
   alias Loopctl.Workers.ArticleLinkingWorker
@@ -71,6 +72,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
   # 0.6 leaves them isolated forever. Re-link orphans at a LOWER threshold so an
   # isolated article connects to its closest relative rather than dangling.
   @default_orphan_link_threshold 0.5
+  @default_max_conflict_promotions 500
 
   @impl Oban.Worker
   def perform(%Oban.Job{args: %{"mode" => "all_tenants"}}) do
@@ -91,11 +93,13 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
     {:ok, report} = Knowledge.lint(tenant_id, max_per_category: @lint_max_per_category)
 
     action = act_on_orphans(tenant_id, report)
-    log_audit_event(tenant_id, report, action)
+    promoted = promote_conflicts(tenant_id)
+    log_audit_event(tenant_id, report, action, promoted)
 
     Logger.info(
       "KnowledgeLintWorker: tenant=#{tenant_id} issues=#{report.summary.total_issues} " <>
-        "orphans_relinked=#{action.relinked} orphans_embedding_enqueued=#{action.embedding_enqueued}"
+        "orphans_relinked=#{action.relinked} orphans_embedding_enqueued=#{action.embedding_enqueued} " <>
+        "conflicts_promoted=#{promoted}"
     )
 
     :ok
@@ -150,6 +154,90 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
     %{relinked: length(with_embedding), embedding_enqueued: length(without_embedding)}
   end
 
+  # Route-the-findings (#4), existing-corpus backstop: the auto-linker stored the
+  # cosine `similarity_score` on every ambient `:relates_to` link. Any such link at or
+  # above the conflict threshold is a "too similar to coexist" pair that predates the
+  # forward detection — promote it to also carry a `:potential_conflict` flag (no new
+  # embedding calls). Bounded per run; cycles the corpus over nights. Idempotent: a
+  # pair already flagged (either direction) is skipped.
+  #
+  # Returns the number of `:potential_conflict` links inserted by this run.
+  defp promote_conflicts(tenant_id) do
+    threshold = Application.get_env(:loopctl, :knowledge_conflict_threshold, 0.93)
+
+    cap =
+      Application.get_env(
+        :loopctl,
+        :knowledge_lint_max_conflict_promotions,
+        @default_max_conflict_promotions
+      )
+
+    candidates =
+      from(l in ArticleLink,
+        as: :rel,
+        where: l.tenant_id == ^tenant_id,
+        where: l.relationship_type == :relates_to,
+        where: fragment("(?->>'similarity_score')::float >= ?", l.metadata, ^threshold),
+        where:
+          not exists(
+            from(pc in ArticleLink,
+              where:
+                pc.tenant_id == parent_as(:rel).tenant_id and
+                  pc.relationship_type == :potential_conflict and
+                  ((pc.source_article_id == parent_as(:rel).source_article_id and
+                      pc.target_article_id == parent_as(:rel).target_article_id) or
+                     (pc.source_article_id == parent_as(:rel).target_article_id and
+                        pc.target_article_id == parent_as(:rel).source_article_id))
+            )
+          ),
+        select: %{
+          source_article_id: l.source_article_id,
+          target_article_id: l.target_article_id,
+          metadata: l.metadata
+        },
+        limit: ^cap
+      )
+      |> AdminRepo.all()
+      # A pair may carry a `:relates_to` row in each direction (e.g. two linking jobs
+      # racing). Neither row is excluded by the NOT EXISTS above until the first insert
+      # lands, so collapse by unordered pair here — one flag per pair, not per direction.
+      |> Enum.uniq_by(fn %{source_article_id: src, target_article_id: tgt} ->
+        {min(src, tgt), max(src, tgt)}
+      end)
+
+    Enum.reduce(candidates, 0, fn c, count ->
+      attrs = %{
+        source_article_id: c.source_article_id,
+        target_article_id: c.target_article_id,
+        relationship_type: :potential_conflict,
+        metadata: %{
+          "auto_generated" => true,
+          "similarity_score" => c.metadata["similarity_score"],
+          "promoted_from" => "relates_to"
+        }
+      }
+
+      changeset = ArticleLink.changeset(%ArticleLink{tenant_id: tenant_id}, attrs)
+
+      case AdminRepo.insert(changeset) do
+        {:ok, _} ->
+          count + 1
+
+        # Lost a race / already exists — skip, stay idempotent. Logged rather than
+        # swallowed so a row that fails for any other reason is visible instead of
+        # silently consuming the per-run cap every night.
+        {:error, failed} ->
+          Logger.warning(
+            "KnowledgeLintWorker: tenant=#{tenant_id} conflict promotion skipped " <>
+              "source=#{c.source_article_id} target=#{c.target_article_id} " <>
+              "errors=#{inspect(failed.errors)}"
+          )
+
+          count
+      end
+    end)
+  end
+
   defp embedded_ids(_tenant_id, []), do: MapSet.new()
 
   defp embedded_ids(tenant_id, orphan_ids) do
@@ -163,7 +251,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
     |> MapSet.new()
   end
 
-  defp log_audit_event(tenant_id, report, action) do
+  defp log_audit_event(tenant_id, report, action, promoted) do
     Audit.create_log_entry(tenant_id, %{
       entity_type: "knowledge_lint",
       entity_id: tenant_id,
@@ -174,7 +262,8 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
       new_state: %{
         "summary" => report.summary,
         "orphans_relinked" => action.relinked,
-        "orphans_embedding_enqueued" => action.embedding_enqueued
+        "orphans_embedding_enqueued" => action.embedding_enqueued,
+        "conflicts_promoted" => promoted
       }
     })
   end
diff --git a/test/loopctl/workers/article_linking_worker_test.exs b/test/loopctl/workers/article_linking_worker_test.exs
index e73aa3b..8be9cbe 100644
--- a/test/loopctl/workers/article_linking_worker_test.exs
+++ b/test/loopctl/workers/article_linking_worker_test.exs
@@ -64,6 +64,56 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
     List.duplicate(1.0, 768) ++ List.duplicate(1.518, 768)
   end
 
+  # cos(similar_embedding, this) = 768 / (sqrt(768) * sqrt(1536)) ~= 0.707 — related
+  # (>= 0.6) but below the 0.93 conflict threshold.
+  defp moderately_similar_embedding do
+    List.duplicate(1.0, 1536)
+  end
+
+  defp links_of_type(tenant_id, a_id, b_id, type) do
+    from(l in ArticleLink,
+      where: l.tenant_id == ^tenant_id,
+      where: l.relationship_type == ^type,
+      where:
+        (l.source_article_id == ^a_id and l.target_article_id == ^b_id) or
+          (l.source_article_id == ^b_id and l.target_article_id == ^a_id)
+    )
+    |> AdminRepo.all()
+  end
+
+  describe "potential conflict detection (#4)" do
+    test "flags a near-identical pair with a :potential_conflict link" do
+      %{tenant: tenant} = setup_tenant()
+      source = create_article_with_embedding(tenant.id, similar_embedding())
+      dup = create_article_with_embedding(tenant.id, near_similar_embedding())
+
+      assert :ok =
+               ArticleLinkingWorker.perform(%Oban.Job{
+                 args: %{"article_id" => source.id, "tenant_id" => tenant.id}
+               })
+
+      # Both an ambient relates_to AND the conflict flag.
+      assert [_] = links_of_type(tenant.id, source.id, dup.id, :relates_to)
+      assert [conflict] = links_of_type(tenant.id, source.id, dup.id, :potential_conflict)
+      assert conflict.metadata["similarity_score"] >= 0.93
+      assert conflict.metadata["auto_generated"] == true
+    end
+
+    test "a merely-related pair (below the conflict threshold) gets NO conflict flag" do
+      %{tenant: tenant} = setup_tenant()
+      source = create_article_with_embedding(tenant.id, similar_embedding())
+      related = create_article_with_embedding(tenant.id, moderately_similar_embedding())
+
+      assert :ok =
+               ArticleLinkingWorker.perform(%Oban.Job{
+                 args: %{"article_id" => source.id, "tenant_id" => tenant.id}
+               })
+
+      assert [_] = links_of_type(tenant.id, source.id, related.id, :relates_to)
+      assert [] == links_of_type(tenant.id, source.id, related.id, :potential_conflict)
+    end
+  end
+
   # --- TC-21.2.1: Creates relates_to links for similar articles ---
 
   describe "perform/1 creates links" do
@@ -226,13 +276,15 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
       links =
         from(l in ArticleLink,
           where: l.tenant_id == ^tenant.id,
+          where: l.relationship_type == :relates_to,
           where:
             (l.source_article_id == ^article_a.id and l.target_article_id == ^article_b.id) or
               (l.source_article_id == ^article_b.id and l.target_article_id == ^article_a.id)
         )
         |> AdminRepo.all()
 
-      # Only the manually created one should exist
+      # Only the manually created relates_to one should exist (a high-similarity pair
+      # also gets a separate :potential_conflict flag — that's #4, asserted elsewhere).
       assert length(links) == 1
     end
   end
@@ -301,10 +353,12 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
                  args: %{"article_id" => source.id, "tenant_id" => tenant.id}
                })
 
-      # All 3 similar articles should be linked (within default limit of 50)
+      # All 3 similar articles should be linked (within default limit of 50). Scope to
+      # :relates_to — high-similarity pairs also get a :potential_conflict link (#4).
       link_count =
         from(l in ArticleLink,
           where: l.tenant_id == ^tenant.id,
+          where: l.relationship_type == :relates_to,
           where: l.source_article_id == ^source.id
         )
         |> AdminRepo.aggregate(:count)
@@ -419,7 +473,9 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
       assert audit.actor_type == "system"
       assert audit.actor_label == "worker:article_linking"
       assert audit.new_state["article_id"] == source.id
-      assert audit.new_state["new_link_count"] == 1
+      # 2 links: a :relates_to and, since the pair is near-identical (>= conflict
+      # threshold), a :potential_conflict flag (#4) — both are auto-links created here.
+      assert audit.new_state["new_link_count"] == 2
     end
   end
 
diff --git a/test/loopctl/workers/knowledge_lint_worker_test.exs b/test/loopctl/workers/knowledge_lint_worker_test.exs
index c55c344..793aa0b 100644
--- a/test/loopctl/workers/knowledge_lint_worker_test.exs
+++ b/test/loopctl/workers/knowledge_lint_worker_test.exs
@@ -179,4 +179,69 @@ defmodule Loopctl.Workers.KnowledgeLintWorkerTest do
       assert [] == lint_audit_entries(tenant_b.id)
     end
   end
+
+  describe "conflict promotion (#4 existing-corpus backstop)" do
+    defp relates_link(tenant_id, src_id, tgt_id, score) do
+      %ArticleLink{tenant_id: tenant_id}
+      |> ArticleLink.changeset(%{
+        source_article_id: src_id,
+        target_article_id: tgt_id,
+        relationship_type: :relates_to,
+        metadata: %{"auto_generated" => true, "similarity_score" => score}
+      })
+      |> AdminRepo.insert!()
+    end
+
+    defp conflict_links(tenant_id, a_id, b_id) do
+      from(l in ArticleLink,
+        where: l.tenant_id == ^tenant_id,
+        where: l.relationship_type == :potential_conflict,
+        where:
+          (l.source_article_id == ^a_id and l.target_article_id == ^b_id) or
+            (l.source_article_id == ^b_id and l.target_article_id == ^a_id)
+      )
+      |> AdminRepo.all()
+    end
+
+    test "promotes a high-similarity relates_to link to a :potential_conflict flag" do
+      tenant = fixture(:tenant)
+      a = published_article_with_embedding(tenant.id, similar_embedding())
+      b = published_article_with_embedding(tenant.id, near_similar_embedding())
+      relates_link(tenant.id, a.id, b.id, 0.95)
+
+      assert :ok = KnowledgeLintWorker.perform(%Oban.Job{args: %{"tenant_id" => tenant.id}})
+
+      assert [conflict] = conflict_links(tenant.id, a.id, b.id)
+      assert conflict.metadata["promoted_from"] == "relates_to"
+      assert conflict.metadata["similarity_score"] == 0.95
+
+      assert [entry] = lint_audit_entries(tenant.id)
+      assert entry.new_state["conflicts_promoted"] == 1
+    end
+
+    test "leaves a below-threshold relates_to link alone" do
+      tenant = fixture(:tenant)
+      a = published_article_with_embedding(tenant.id, similar_embedding())
+      b = published_article_with_embedding(tenant.id, near_similar_embedding())
+      relates_link(tenant.id, a.id, b.id, 0.80)
+
+      assert :ok = KnowledgeLintWorker.perform(%Oban.Job{args: %{"tenant_id" => tenant.id}})
+
+      assert [] == conflict_links(tenant.id, a.id, b.id)
+    end
+
+    test "is idempotent — a second run promotes nothing new" do
+      tenant = fixture(:tenant)
+      a = published_article_with_embedding(tenant.id, similar_embedding())
+      b = published_article_with_embedding(tenant.id, near_similar_embedding())
+      relates_link(tenant.id, a.id, b.id, 0.96)
+
+      assert :ok = KnowledgeLintWorker.perform(%Oban.Job{args: %{"tenant_id" => tenant.id}})
+      assert :ok = KnowledgeLintWorker.perform(%Oban.Job{args: %{"tenant_id" => tenant.id}})
+
+      assert [_only_one] = conflict_links(tenant.id, a.id, b.id)
+      assert [_, second] = lint_audit_entries(tenant.id) |> Enum.sort_by(& &1.inserted_at)
+      assert second.new_state["conflicts_promoted"] == 0
+    end
+  end
 end