EREGCSC-2767 Fix grouping post-save hook (#1440)

* fix: grouping algorithm behavior in a multigroup or removal scenario * on group remove, use related_resources set to compute all affected resources & groups. - previously relied only on current groups assigned to the resource, was inaccurate. * fix group_parent field update behavior * fix unusual bug where queryset is emptied halfway through the algorithm * provide more comprehensive unit tests to ensure all behaviors work, including: - adding one or more group(s) - remove one or more group(s) - remove a group and assign a new group in the same save operation - ensure group parents are computed properly for all operations * linting * test: add test for modifying group directly * fix: one-time data migration for resources to assign group params
Enterprise-CMCS · Oct 9, 2024 · e1aae0f · e1aae0f
1 parent a77ce0f
commit e1aae0f
Show file tree

Hide file tree

Showing 3 changed files with 337 additions and 83 deletions.
diff --git a/solution/backend/resources/migrations/0006_recompute_groups.py b/solution/backend/resources/migrations/0006_recompute_groups.py
@@ -0,0 +1,58 @@
+from django.db import migrations, models
+from django.db.models import Value, Q
+from django.contrib.postgres.aggregates import ArrayAgg
+
+
+TIMEOUT_MINUTES = 10
+
+
+def generate_related(apps, schema_editor):
+    schema_editor.execute(f"SET LOCAL statement_timeout TO {TIMEOUT_MINUTES * 60000};")
+    AbstractResource = apps.get_model("resources", "AbstractResource")
+    ResourceGroup = apps.get_model("resources", "ResourceGroup")
+
+    for resource in AbstractResource.objects.all():
+        new_groups = resource.resource_groups.all()
+        q = Q(pk__in=new_groups) | Q(resources__in=resource.related_resources.all())
+        all_groups = list(ResourceGroup.objects.filter(q).distinct().values_list("pk", flat=True))
+        all_resources = list(AbstractResource.objects.filter(resource_groups__in=all_groups).distinct().values_list("pk", flat=True))
+
+        # Set the parent of each affected group (most recent by date at the top of each group's hierarchy)
+        AbstractResource.objects.filter(pk__in=all_resources).update(group_parent=False)
+        for group in ResourceGroup.objects.filter(pk__in=all_groups):
+            group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True)
+
+        # Compute the related resources, citations, categories, and subjects
+        # Except for related_resources, these lists are inclusive of the resource we are processing
+        if new_groups:
+            related_resources = AbstractResource.objects.filter(resource_groups__in=new_groups)
+            related_aggregates = related_resources.aggregate(
+                all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])),
+                all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])),
+                all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])),
+            )
+            related_resources = related_resources.exclude(pk=resource.pk)  # Exclude the current resource for related_resources
+
+            # Set related_X fields for this resource
+            resource.related_resources.set(related_resources)
+            resource.related_citations.set(related_aggregates["all_citations"])
+            resource.related_categories.set(related_aggregates["all_categories"])
+            resource.related_subjects.set(related_aggregates["all_subjects"])
+        else:
+            # Set related_X to contain only the X objects in the individual resource.
+            # We must do this for filtering purposes when `group_resources=true` on resource endpoints.
+            resource.related_resources.clear()
+            resource.related_citations.set(resource.cfr_citations.all())
+            resource.related_categories.set([resource.category] if resource.category else [])
+            resource.related_subjects.set(resource.subjects.all())
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('resources', '0005_reset_extract_url'),
+    ]
+
+    operations = [
+        migrations.RunPython(generate_related, reverse_code=migrations.RunPython.noop),
+    ]
diff --git a/solution/backend/resources/models/groups.py b/solution/backend/resources/models/groups.py
@@ -52,50 +52,58 @@ class Meta:
 
 
 def update_related_resources(resource, first=True):
-    groups = resource.resource_groups.all()
+    # Compute the difference between old groups and new groups and process all affected resources.
+    # This takes care of the case where the resource was previously in a group, but is now not in that group.
+    # We must ensure related fields and parents are updated for all related_resources that currently exist.
+    #
+    # Note that all_groups and all_resources are cast to lists of pks to avoid issues with the queryset being erased
+    # later in this function for reasons unknown.
+    new_groups = resource.resource_groups.all()
+    q = Q(pk__in=new_groups) | Q(resources__in=resource.related_resources.all())
+    all_groups = list(ResourceGroup.objects.filter(q).distinct().values_list("pk", flat=True))
+    all_resources = list(AbstractResource.objects.filter(resource_groups__in=all_groups).distinct().values_list("pk", flat=True))
 
-    if not groups:
-        # This resource does not belong to any groups, so we will treat it differently.
+    if first:
+        # Set the parent of each affected group (most recent by date at the top of each group's hierarchy)
+        AbstractResource.objects.filter(pk__in=all_resources).update(group_parent=False)
+        for group in ResourceGroup.objects.filter(pk__in=all_groups):
+            group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True)
+
+    # Compute the related resources, citations, categories, and subjects
+    # Except for related_resources, these lists are inclusive of the resource we are processing
+    if new_groups:
+        related_resources = AbstractResource.objects.filter(resource_groups__in=new_groups)
+        related_aggregates = related_resources.aggregate(
+            all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])),
+            all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])),
+            all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])),
+        )
+        related_resources = related_resources.exclude(pk=resource.pk)  # Exclude the current resource for related_resources
+
+        # Set related_X fields for this resource
+        resource.related_resources.set(related_resources)
+        resource.related_citations.set(related_aggregates["all_citations"])
+        resource.related_categories.set(related_aggregates["all_categories"])
+        resource.related_subjects.set(related_aggregates["all_subjects"])
+    else:
         # Set related_X to contain only the X objects in the individual resource.
         # We must do this for filtering purposes when `group_resources=true` on resource endpoints.
         resource.related_resources.clear()
         resource.related_citations.set(resource.cfr_citations.all())
         resource.related_categories.set([resource.category] if resource.category else [])
         resource.related_subjects.set(resource.subjects.all())
-        return
-
-    if first:
-        # Set the parent of each group (most recent by date at the top of each group's hierarchy)
-        AbstractResource.objects.filter(resource_groups__in=groups).update(group_parent=False)
-        for group in groups:
-            group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True)
-
-    # Compute the related resources, citations, categories, and subjects
-    # Except for related_resources, these lists are inclusive of the resource we are processing
-    related_resources = AbstractResource.objects.filter(resource_groups__in=groups)
-    related_aggregates = related_resources.aggregate(
-        all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])),
-        all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])),
-        all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])),
-    )
-    related_resources = related_resources.exclude(pk=resource.pk)  # Exclude the current resource for related_resources
-
-    # Set related_X for this resource
-    resource.related_resources.set(related_resources)
-    resource.related_citations.set(related_aggregates["all_citations"])
-    resource.related_categories.set(related_aggregates["all_categories"])
-    resource.related_subjects.set(related_aggregates["all_subjects"])
 
     if first:
-        # For the first resource processed, also update all of its related_resources
-        for related_resource in related_resources:
-            update_related_resources(related_resource, False)
+        # For the first resource processed, also update all affected resources (new and old)
+        for affected in AbstractResource.objects.filter(pk__in=all_resources):
+            update_related_resources(affected, False)
 
 
 @receiver(post_save, sender=ResourceGroup)
 def update_resource_group(sender, instance, **kwargs):
-    for resource in instance.resources.all():
-        update_related_resources(resource)
+    resources = instance.resources.all()
+    if resources:
+        update_related_resources(resources.first())
 
 
 @receiver(post_save, sender=AbstractResource)