Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,15 @@ config :loopctl, :knowledge_moc_excluded_tags, ~w(synology-docs synology-netback
config :loopctl, :knowledge_proposal_duplicate_threshold, 0.97
config :loopctl, :knowledge_proposal_overlap_threshold, 0.88

# Route-the-findings (#4): two published articles whose cosine similarity is at/above
# this are "too similar to comfortably coexist" — flagged with a `:potential_conflict`
# link for the consuming agent to resolve (merge a redundancy / reconcile a real
# contradiction). The KB only flags; it never judges which it is.
config :loopctl, :knowledge_conflict_threshold, 0.93
# Maximum number of `:relates_to` → `:potential_conflict` promotions the nightly lint
# sweep performs per tenant per run. This bounds the existing-corpus backfill; pairs
# left over once the cap is reached are picked up on later nights.
config :loopctl, :knowledge_lint_max_conflict_promotions, 500

# DI: WebAuthn adapter — defaults to Wax (overridden in test env)
config :loopctl, :webauthn_adapter, Loopctl.WebAuthn.Wax

Expand Down
14 changes: 12 additions & 2 deletions lib/loopctl/knowledge/article_link.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@ defmodule Loopctl.Knowledge.ArticleLink do
- `tenant_id` -- FK to tenants (set programmatically, never cast)
- `source_article_id` -- FK to articles (the origin of the link)
- `target_article_id` -- FK to articles (the destination of the link)
- `relationship_type` -- enum: relates_to, derived_from, contradicts, supersedes
- `relationship_type` -- enum: relates_to, derived_from, contradicts, supersedes,
potential_conflict (a MECHANICAL "too similar to comfortably coexist" flag —
NOT an assertion that the two disagree; the consuming agent judges whether it's a
redundancy to merge or a real contradiction. Distinct from `contradicts`, which
asserts a known disagreement.)
- `metadata` -- extensible JSONB, defaults to `%{}`
- `inserted_at` -- creation timestamp (no updated_at)

Expand All @@ -29,7 +33,13 @@ defmodule Loopctl.Knowledge.ArticleLink do

@type t :: %__MODULE__{}

@relationship_type_values [:relates_to, :derived_from, :contradicts, :supersedes]
@relationship_type_values [
:relates_to,
:derived_from,
:contradicts,
:supersedes,
:potential_conflict
]

schema "article_links" do
tenant_field()
Expand Down
54 changes: 37 additions & 17 deletions lib/loopctl/workers/article_linking_worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -129,28 +129,47 @@ defmodule Loopctl.Workers.ArticleLinkingWorker do
log_if_exceeds_limit(article, tenant_id, max_comparisons)

candidates = find_similar_articles(article, tenant_id, threshold, max_comparisons)
existing_pairs = get_existing_link_pairs(article.id, tenant_id)

new_links =
candidates
|> Enum.reject(fn %{id: cid} ->
MapSet.member?(existing_pairs, {article.id, cid}) or
MapSet.member?(existing_pairs, {cid, article.id})
end)
|> Enum.map(fn %{id: target_id, similarity: score} ->
%{
source_article_id: article.id,
target_article_id: target_id,
relationship_type: :relates_to,
metadata: %{"auto_generated" => true, "similarity_score" => score}
}
conflict_threshold = conflict_threshold()

# A `relates_to` ambient link for everything >= the link threshold, PLUS a
# `:potential_conflict` flag (route-the-findings #4) for pairs >= the conflict
# threshold — too similar to comfortably coexist, for the consumer to resolve.
# Dedup is type-aware so the two link types don't crowd each other out.
relates =
build_links(article.id, candidates, tenant_id, :relates_to, fn _sim -> true end)

conflicts =
build_links(article.id, candidates, tenant_id, :potential_conflict, fn sim ->
sim >= conflict_threshold
end)

created_count = create_links(new_links, tenant_id)
created_count = create_links(relates ++ conflicts, tenant_id)
log_audit_event(article.id, tenant_id, created_count)
:ok
end

# Builds the attribute maps for new auto-generated links of `type` from `article_id`
# to each qualifying candidate.
#
# `candidates` are maps carrying `:id` and `:similarity`. `keep?` is a 1-arity
# predicate on the similarity that decides which candidates qualify for this link
# type. Candidates already linked to `article_id` with the same `type` (in either
# direction) are dropped, so dedup is per link type.
#
# Returns a list of maps with `:source_article_id`, `:target_article_id`,
# `:relationship_type` and `:metadata`.
defp build_links(article_id, candidates, tenant_id, type, keep?) do
  case Enum.filter(candidates, fn %{similarity: sim} -> keep?.(sim) end) do
    [] ->
      # Nothing qualifies, so there is nothing to dedup against: skip the
      # existing-links lookup rather than fetching a result that is never used.
      []

    qualifying ->
      existing = get_existing_link_pairs(article_id, tenant_id, type)

      qualifying
      |> Enum.reject(fn %{id: cid} ->
        MapSet.member?(existing, {article_id, cid}) or
          MapSet.member?(existing, {cid, article_id})
      end)
      |> Enum.map(fn %{id: target_id, similarity: score} ->
        %{
          source_article_id: article_id,
          target_article_id: target_id,
          relationship_type: type,
          metadata: %{"auto_generated" => true, "similarity_score" => score}
        }
      end)
  end
end

# Similarity cut-off for the `:potential_conflict` flag. Looked up in the
# `:knowledge_conflict_threshold` application env on every call; 0.93 when unset.
defp conflict_threshold,
  do: Application.get_env(:loopctl, :knowledge_conflict_threshold, 0.93)

# US-27.7a: route the similarity lookup through the shared, scale-tested kNN helper
# (`Loopctl.Knowledge.VectorSearch.nearest/4`) on the dedicated HeavyRead pool instead
# of a bespoke `AdminRepo` cosine query. The helper's index-correct shape guarantees a
Expand Down Expand Up @@ -224,9 +243,10 @@ defmodule Loopctl.Workers.ArticleLinkingWorker do
where(query, [a], is_nil(a.project_id) or a.project_id == ^project_id)
end

defp get_existing_link_pairs(article_id, tenant_id) do
defp get_existing_link_pairs(article_id, tenant_id, type) do
from(l in ArticleLink,
where: l.tenant_id == ^tenant_id,
where: l.relationship_type == ^type,
where: l.source_article_id == ^article_id or l.target_article_id == ^article_id,
select: {l.source_article_id, l.target_article_id}
)
Expand Down
78 changes: 74 additions & 4 deletions lib/loopctl/workers/knowledge_lint_worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
alias Loopctl.Audit
alias Loopctl.Knowledge
alias Loopctl.Knowledge.Article
alias Loopctl.Knowledge.ArticleLink
alias Loopctl.Tenants.Tenant
alias Loopctl.Workers.ArticleEmbeddingWorker
alias Loopctl.Workers.ArticleLinkingWorker
Expand All @@ -71,6 +72,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
# 0.6 leaves them isolated forever. Re-link orphans at a LOWER threshold so an
# isolated article connects to its closest relative rather than dangling.
@default_orphan_link_threshold 0.5
@default_max_conflict_promotions 500

@impl Oban.Worker
def perform(%Oban.Job{args: %{"mode" => "all_tenants"}}) do
Expand All @@ -91,11 +93,13 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
{:ok, report} = Knowledge.lint(tenant_id, max_per_category: @lint_max_per_category)

action = act_on_orphans(tenant_id, report)
log_audit_event(tenant_id, report, action)
promoted = promote_conflicts(tenant_id)
log_audit_event(tenant_id, report, action, promoted)

Logger.info(
"KnowledgeLintWorker: tenant=#{tenant_id} issues=#{report.summary.total_issues} " <>
"orphans_relinked=#{action.relinked} orphans_embedding_enqueued=#{action.embedding_enqueued}"
"orphans_relinked=#{action.relinked} orphans_embedding_enqueued=#{action.embedding_enqueued} " <>
"conflicts_promoted=#{promoted}"
)

:ok
Expand Down Expand Up @@ -150,6 +154,71 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
%{relinked: length(with_embedding), embedding_enqueued: length(without_embedding)}
end

# Route-the-findings (#4), existing-corpus backstop: the auto-linker stored the
# cosine `similarity_score` on every ambient `:relates_to` link. Any such link at or
# above the conflict threshold is a "too similar to coexist" pair that predates the
# forward detection — promote it to also carry a `:potential_conflict` flag (no new
# embedding calls). Bounded per run by `:knowledge_lint_max_conflict_promotions`; the
# rest of the corpus is picked up on later nights. Idempotent: a pair already flagged
# (either direction) is skipped, and a pair is flagged at most once per run even when
# `:relates_to` links exist in both directions.
#
# Returns the number of `:potential_conflict` links actually inserted.
defp promote_conflicts(tenant_id) do
  threshold = Application.get_env(:loopctl, :knowledge_conflict_threshold, 0.93)

  cap =
    Application.get_env(
      :loopctl,
      :knowledge_lint_max_conflict_promotions,
      @default_max_conflict_promotions
    )

  from(l in ArticleLink,
    as: :rel,
    where: l.tenant_id == ^tenant_id,
    where: l.relationship_type == :relates_to,
    # Links without a `similarity_score` yield NULL here and are filtered out.
    where: fragment("(?->>'similarity_score')::float >= ?", l.metadata, ^threshold),
    where:
      not exists(
        from(pc in ArticleLink,
          where:
            pc.tenant_id == parent_as(:rel).tenant_id and
              pc.relationship_type == :potential_conflict and
              ((pc.source_article_id == parent_as(:rel).source_article_id and
                  pc.target_article_id == parent_as(:rel).target_article_id) or
                 (pc.source_article_id == parent_as(:rel).target_article_id and
                    pc.target_article_id == parent_as(:rel).source_article_id))
        )
      ),
    select: %{
      source_article_id: l.source_article_id,
      target_article_id: l.target_article_id,
      metadata: l.metadata
    },
    limit: ^cap
  )
  |> AdminRepo.all()
  # The NOT EXISTS guard only sees rows already in the table. If a pair has
  # `:relates_to` links in both directions, both rows come back in this batch, so
  # keep one candidate per unordered pair to avoid flagging the same pair twice.
  |> Enum.uniq_by(fn c ->
    {min(c.source_article_id, c.target_article_id),
     max(c.source_article_id, c.target_article_id)}
  end)
  |> Enum.reduce(0, fn c, count ->
    attrs = %{
      source_article_id: c.source_article_id,
      target_article_id: c.target_article_id,
      relationship_type: :potential_conflict,
      metadata: %{
        "auto_generated" => true,
        "similarity_score" => c.metadata["similarity_score"],
        "promoted_from" => "relates_to"
      }
    }

    changeset = ArticleLink.changeset(%ArticleLink{tenant_id: tenant_id}, attrs)

    case AdminRepo.insert(changeset) do
      {:ok, _} ->
        count + 1

      # Lost a race / already exists — skip, stay idempotent. Logged at debug level
      # so a pair that fails on every run can still be diagnosed.
      {:error, reason} ->
        Logger.debug(fn ->
          "KnowledgeLintWorker: tenant=#{tenant_id} conflict promotion skipped " <>
            "source=#{c.source_article_id} target=#{c.target_article_id} " <>
            "reason=#{inspect(reason)}"
        end)

        count
    end
  end)
end

defp embedded_ids(_tenant_id, []), do: MapSet.new()

defp embedded_ids(tenant_id, orphan_ids) do
Expand All @@ -163,7 +232,7 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
|> MapSet.new()
end

defp log_audit_event(tenant_id, report, action) do
defp log_audit_event(tenant_id, report, action, promoted) do
Audit.create_log_entry(tenant_id, %{
entity_type: "knowledge_lint",
entity_id: tenant_id,
Expand All @@ -174,7 +243,8 @@ defmodule Loopctl.Workers.KnowledgeLintWorker do
new_state: %{
"summary" => report.summary,
"orphans_relinked" => action.relinked,
"orphans_embedding_enqueued" => action.embedding_enqueued
"orphans_embedding_enqueued" => action.embedding_enqueued,
"conflicts_promoted" => promoted
}
})
end
Expand Down
62 changes: 59 additions & 3 deletions test/loopctl/workers/article_linking_worker_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,56 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
List.duplicate(1.0, 768) ++ List.duplicate(1.518, 768)
end

# A 1536-dimension vector of all ones. Its cosine similarity with
# `similar_embedding/0` is 768 / (sqrt(768) * sqrt(1536)) ~= 0.707: high enough to be
# related (>= 0.6) yet under the 0.93 conflict threshold.
defp moderately_similar_embedding do
  dimensions = 1536
  List.duplicate(1.0, dimensions)
end

# Fetches every link of `type` between the two articles within the tenant,
# regardless of which of the two is the source.
defp links_of_type(tenant_id, a_id, b_id, type) do
  query =
    from(link in ArticleLink,
      where:
        link.tenant_id == ^tenant_id and
          link.relationship_type == ^type and
          ((link.source_article_id == ^a_id and link.target_article_id == ^b_id) or
             (link.source_article_id == ^b_id and link.target_article_id == ^a_id))
    )

  AdminRepo.all(query)
end

describe "potential conflict detection (#4)" do
  test "flags a near-identical pair with a :potential_conflict link" do
    %{tenant: %{id: tenant_id}} = setup_tenant()
    source = create_article_with_embedding(tenant_id, similar_embedding())
    duplicate = create_article_with_embedding(tenant_id, near_similar_embedding())

    job = %Oban.Job{args: %{"article_id" => source.id, "tenant_id" => tenant_id}}
    assert ArticleLinkingWorker.perform(job) == :ok

    # The pair carries the ambient relates_to link as well as the conflict flag.
    assert [_relates] = links_of_type(tenant_id, source.id, duplicate.id, :relates_to)

    assert [%{metadata: metadata}] =
             links_of_type(tenant_id, source.id, duplicate.id, :potential_conflict)

    assert metadata["similarity_score"] >= 0.93
    assert metadata["auto_generated"] == true
  end

  test "a merely-related pair (below the conflict threshold) gets NO conflict flag" do
    %{tenant: %{id: tenant_id}} = setup_tenant()
    source = create_article_with_embedding(tenant_id, similar_embedding())
    related = create_article_with_embedding(tenant_id, moderately_similar_embedding())

    job = %Oban.Job{args: %{"article_id" => source.id, "tenant_id" => tenant_id}}
    assert ArticleLinkingWorker.perform(job) == :ok

    # Related enough for the ambient link, but not similar enough to be flagged.
    assert [_relates] = links_of_type(tenant_id, source.id, related.id, :relates_to)
    assert links_of_type(tenant_id, source.id, related.id, :potential_conflict) == []
  end
end

# --- TC-21.2.1: Creates relates_to links for similar articles ---

describe "perform/1 creates links" do
Expand Down Expand Up @@ -226,13 +276,15 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
links =
from(l in ArticleLink,
where: l.tenant_id == ^tenant.id,
where: l.relationship_type == :relates_to,
where:
(l.source_article_id == ^article_a.id and l.target_article_id == ^article_b.id) or
(l.source_article_id == ^article_b.id and l.target_article_id == ^article_a.id)
)
|> AdminRepo.all()

# Only the manually created one should exist
# Only the manually created relates_to one should exist (a high-similarity pair
# also gets a separate :potential_conflict flag — that's #4, asserted elsewhere).
assert length(links) == 1
end
end
Expand Down Expand Up @@ -301,10 +353,12 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
args: %{"article_id" => source.id, "tenant_id" => tenant.id}
})

# All 3 similar articles should be linked (within default limit of 50)
# All 3 similar articles should be linked (within default limit of 50). Scope to
# :relates_to — high-similarity pairs also get a :potential_conflict link (#4).
link_count =
from(l in ArticleLink,
where: l.tenant_id == ^tenant.id,
where: l.relationship_type == :relates_to,
where: l.source_article_id == ^source.id
)
|> AdminRepo.aggregate(:count)
Expand Down Expand Up @@ -419,7 +473,9 @@ defmodule Loopctl.Workers.ArticleLinkingWorkerTest do
assert audit.actor_type == "system"
assert audit.actor_label == "worker:article_linking"
assert audit.new_state["article_id"] == source.id
assert audit.new_state["new_link_count"] == 1
# 2 links: a :relates_to and, since the pair is near-identical (>= conflict
# threshold), a :potential_conflict flag (#4) — both are auto-links created here.
assert audit.new_state["new_link_count"] == 2
end
end

Expand Down
Loading
Loading