diff --git a/documentcloud/addons/migrations/0030_addonevent_site_indexes.py b/documentcloud/addons/migrations/0030_addonevent_site_indexes.py new file mode 100644 index 00000000..3cd8725a --- /dev/null +++ b/documentcloud/addons/migrations/0030_addonevent_site_indexes.py @@ -0,0 +1,47 @@ +# Generated by Django 4.2.2 on 2026-06-08 + +from django.contrib.postgres.operations import ( + AddIndexConcurrently, + RemoveIndexConcurrently, +) +from django.db import migrations, models +import django.db.models.fields.json +import django.db.models.functions.text +import documentcloud.addons.models + + +class Migration(migrations.Migration): + + atomic = False + + dependencies = [ + ("addons", "0029_addonrun_data"), + ] + + operations = [ + # index the `site` origin (scheme + host) for the `domain` filter + AddIndexConcurrently( + model_name="addonevent", + index=models.Index( + documentcloud.addons.models.SiteOrigin("parameters"), + condition=models.Q(("parameters__has_key", "site")), + name="addonevent_site_origin_idx", + ), + ), + # rebuild the `site` index as UPPER(...) so the case-insensitive + # (iexact) `site` filter can use it + RemoveIndexConcurrently( + model_name="addonevent", + name="addonevent_param_site_idx", + ), + AddIndexConcurrently( + model_name="addonevent", + index=models.Index( + django.db.models.functions.text.Upper( + django.db.models.fields.json.KeyTextTransform("site", "parameters") + ), + condition=models.Q(("parameters__has_key", "site")), + name="addonevent_param_site_idx", + ), + ), + ] diff --git a/documentcloud/addons/models.py b/documentcloud/addons/models.py index 71884a8e..0af4ca61 100644 --- a/documentcloud/addons/models.py +++ b/documentcloud/addons/models.py @@ -2,7 +2,9 @@ from django.conf import settings from django.core.cache import cache from django.db import models, transaction -from django.db.models import F, Q +from django.db.models import Func, Q +from django.db.models.fields.json import KeyTextTransform +from django.db.models.functions import Upper from django.utils.translation import gettext_lazy as _ # Standard Library @@ -34,6 +36,21 @@ logger = logging.getLogger(__name__) +class SiteOrigin(Func): + # pylint: disable=abstract-method + """Extract the lowercased origin (scheme + host) from a JSON `site` URL. + + Captures the scheme and authority of the stored URL, stopping at the path, + query or fragment, so `https://www.nifc.gov/foo` yields `https://www.nifc.gov`. + Used both to build the expression index on `AddOnEvent` and to filter by it, + so the index and the query share one definition and cannot drift apart. All + component functions are IMMUTABLE, so this is safe to index. + """ + + template = "LOWER(SUBSTRING(%(expressions)s ->> 'site' FROM '^(https?://[^/?#]+)'))" + output_field = models.TextField() + + class AddOn(models.Model): objects = AddOnQuerySet.as_manager() @@ -579,10 +596,17 @@ class AddOnEvent(models.Model): class Meta: indexes = [ models.Index( - F("parameters__site"), + # matches the `UPPER(parameters ->> 'site')` the `site` filter's + # iexact lookup emits, so a case-insensitive match can use it + Upper(KeyTextTransform("site", "parameters")), name="addonevent_param_site_idx", condition=Q(parameters__has_key="site"), ), + models.Index( + SiteOrigin("parameters"), + name="addonevent_site_origin_idx", + condition=Q(parameters__has_key="site"), + ), ] def __str__(self): diff --git a/documentcloud/addons/tests/test_views.py b/documentcloud/addons/tests/test_views.py index 21a285f4..54d66894 100644 --- a/documentcloud/addons/tests/test_views.py +++ b/documentcloud/addons/tests/test_views.py @@ -309,6 +309,48 @@ def test_filter_site_absent_is_noop(self, client): assert response.status_code == status.HTTP_200_OK assert len(response.json()["results"]) == 3 + def test_filter_domain(self, client): + """Filter runs by the origin of the event's parameters.site""" + user = UserFactory() + matching_event = AddOnEventFactory( + user=user, + parameters={ + "site": "https://www.nifc.gov/fire-information/statistics/wildfires", + "selector": "*", + }, + ) + other_event = AddOnEventFactory( + user=user, parameters={"site": "https://www.other.com/path"} + ) + no_site_event = AddOnEventFactory(user=user, parameters={"selector": "*"}) + matching_run = AddOnRunFactory(user=user, event=matching_event) + AddOnRunFactory(user=user, event=other_event) + AddOnRunFactory(user=user, event=no_site_event) + AddOnRunFactory(user=user, event=None) + client.force_authenticate(user=user) + # the origin matches regardless of path + response = client.get("/api/addon_runs/", {"domain": "https://www.nifc.gov"}) + assert response.status_code == status.HTTP_200_OK + uuids = [r["uuid"] for r in response.json()["results"]] + assert uuids == [str(matching_run.uuid)] + # a bare host and a mismatched scheme do not match + for domain in ("www.nifc.gov", "http://www.nifc.gov"): + response = client.get("/api/addon_runs/", {"domain": domain}) + assert response.status_code == status.HTTP_200_OK + assert response.json()["results"] == [], domain + + def test_filter_domain_no_partial_host_match(self, client): + """The domain filter matches whole origins, not substrings""" + user = UserFactory() + event = AddOnEventFactory( + user=user, parameters={"site": "https://www.nifc.gov.evil.com/path"} + ) + AddOnRunFactory(user=user, event=event) + client.force_authenticate(user=user) + response = client.get("/api/addon_runs/", {"domain": "https://www.nifc.gov"}) + assert response.status_code == status.HTTP_200_OK + assert response.json()["results"] == [] + @pytest.mark.django_db() class TestAddOnEventAPI: @@ -351,6 +393,29 @@ def test_filter_site_no_match(self, client): assert response.status_code == status.HTTP_200_OK assert response.json()["results"] == [] + def test_filter_domain(self, client): + """Filter events by the origin of their parameters.site""" + user = UserFactory() + matching = AddOnEventFactory( + user=user, + parameters={ + "site": "https://www.nifc.gov/fire-information/statistics/wildfires", + "selector": "*", + }, + ) + AddOnEventFactory(user=user, parameters={"site": "https://www.other.com/path"}) + AddOnEventFactory(user=user, parameters={"selector": "*"}) + client.force_authenticate(user=user) + response = client.get("/api/addon_events/", {"domain": "https://www.nifc.gov"}) + assert response.status_code == status.HTTP_200_OK + ids = [r["id"] for r in response.json()["results"]] + assert ids == [matching.pk] + # a bare host and a mismatched scheme do not match + for domain in ("www.nifc.gov", "http://www.nifc.gov"): + response = client.get("/api/addon_events/", {"domain": domain}) + assert response.status_code == status.HTTP_200_OK + assert response.json()["results"] == [], domain + def test_filter_message(self, client): """Filter runs by message""" user = UserFactory() diff --git a/documentcloud/addons/views.py b/documentcloud/addons/views.py index acd03fd0..375dd03b 100644 --- a/documentcloud/addons/views.py +++ b/documentcloud/addons/views.py @@ -51,6 +51,7 @@ AddOnRun, GitHubAccount, GitHubInstallation, + SiteOrigin, VisualAddOn, ) from documentcloud.addons.serializers import ( @@ -65,6 +66,20 @@ logger = logging.getLogger(__name__) +def domain_to_origin(value): + """Normalize a domain filter value to a lowercased origin (scheme + host). + + Expects an origin such as `https://www.nifc.gov`; a value without a scheme + will not match. A full URL is accepted and its path is discarded. Returns + None if no scheme + host can be parsed, lowercased to match the `SiteOrigin` + expression index. + """ + parsed = furl(value.strip()) + if not parsed.scheme or not parsed.host: + return None + return f"{parsed.scheme}://{parsed.netloc}".lower() + + class AddOnViewSet(viewsets.ModelViewSet): serializer_class = AddOnSerializer queryset = AddOn.objects.none() @@ -741,11 +756,18 @@ class Filter(django_filters.FilterSet): ) dismissed = django_filters.BooleanFilter(help_text="Was this run dismissed?") site = django_filters.CharFilter( - field_name="event__parameters__site", - lookup_expr="iexact", + method="site_filter", label="Site", help_text="Filter runs by the `site` value in the event's parameters.", ) + domain = django_filters.CharFilter( + method="domain_filter", + label="Domain", + help_text=( + "Filter runs by the origin of the event's `site` parameter, e.g. " + "`https://www.nifc.gov`." + ), + ) message = django_filters.CharFilter( field_name="message", lookup_expr="exact", @@ -753,6 +775,28 @@ class Filter(django_filters.FilterSet): help_text="Filter runs by their progress message.", ) + def site_filter(self, queryset, name, value): + # pylint: disable=unused-argument + # the has_key clause lets the partial `addonevent_param_site_idx` + # index on AddOnEvent serve the case-insensitive match + return queryset.filter( + event__parameters__has_key="site", + event__parameters__site__iexact=value, + ) + + def domain_filter(self, queryset, name, value): + # pylint: disable=unused-argument + origin = domain_to_origin(value) + if origin is None: + return queryset.none() + # filter on has_key + the SiteOrigin expression so the partial + # `addonevent_site_origin_idx` index on AddOnEvent is usable + return ( + queryset.filter(event__parameters__has_key="site") + .annotate(_site_origin=SiteOrigin("event__parameters")) + .filter(_site_origin=origin) + ) + class Meta: model = AddOnRun fields = { @@ -983,11 +1027,40 @@ class Filter(django_filters.FilterSet): help_text="Filter events by a specific add-on ID.", ) site = django_filters.CharFilter( - field_name="parameters__site", - lookup_expr="iexact", + method="site_filter", label="Site", help_text="Filter events by the `site` value in their parameters.", ) + domain = django_filters.CharFilter( + method="domain_filter", + label="Domain", + help_text=( + "Filter events by the origin of their `site` parameter, e.g. " + "`https://www.nifc.gov`." + ), + ) + + def site_filter(self, queryset, name, value): + # pylint: disable=unused-argument + # the has_key clause lets the partial `addonevent_param_site_idx` + # index serve the case-insensitive match + return queryset.filter( + parameters__has_key="site", + parameters__site__iexact=value, + ) + + def domain_filter(self, queryset, name, value): + # pylint: disable=unused-argument + origin = domain_to_origin(value) + if origin is None: + return queryset.none() + # filter on has_key + the SiteOrigin expression so the partial + # `addonevent_site_origin_idx` index is usable + return ( + queryset.filter(parameters__has_key="site") + .annotate(_site_origin=SiteOrigin("parameters")) + .filter(_site_origin=origin) + ) class Meta: model = AddOnEvent