From e1aae0fefba7b2006e7fffffebe56b7592ba6273 Mon Sep 17 00:00:00 2001 From: Caleb Godwin <75450124+cgodwin1@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:50:43 -0400 Subject: [PATCH] EREGCSC-2767 Fix grouping post-save hook (#1440) * fix: grouping algorithm behavior in a multigroup or removal scenario * on group remove, use related_resources set to compute all affected resources & groups. - previously relied only on current groups assigned to the resource, was inaccurate. * fix group_parent field update behavior * fix unusual bug where queryset is emptied halfway through the algorithm * provide more comprehensive unit tests to ensure all behaviors work, including: - adding one or more group(s) - remove one or more group(s) - remove a group and assign a new group in the same save operation - ensure group parents are computed properly for all operations * linting * test: add test for modifying group directly * fix: one-time data migration for resources to assign group params --- .../migrations/0006_recompute_groups.py | 58 ++++ solution/backend/resources/models/groups.py | 70 +++-- .../resources/tests/test_resource_grouping.py | 292 ++++++++++++++---- 3 files changed, 337 insertions(+), 83 deletions(-) create mode 100644 solution/backend/resources/migrations/0006_recompute_groups.py diff --git a/solution/backend/resources/migrations/0006_recompute_groups.py b/solution/backend/resources/migrations/0006_recompute_groups.py new file mode 100644 index 000000000..ec022be1a --- /dev/null +++ b/solution/backend/resources/migrations/0006_recompute_groups.py @@ -0,0 +1,58 @@ +from django.db import migrations, models +from django.db.models import Value, Q +from django.contrib.postgres.aggregates import ArrayAgg + + +TIMEOUT_MINUTES = 10 + + +def generate_related(apps, schema_editor): + schema_editor.execute(f"SET LOCAL statement_timeout TO {TIMEOUT_MINUTES * 60000};") + AbstractResource = apps.get_model("resources", "AbstractResource") + ResourceGroup = apps.get_model("resources", "ResourceGroup") + + for resource in AbstractResource.objects.all(): + new_groups = resource.resource_groups.all() + q = Q(pk__in=new_groups) | Q(resources__in=resource.related_resources.all()) + all_groups = list(ResourceGroup.objects.filter(q).distinct().values_list("pk", flat=True)) + all_resources = list(AbstractResource.objects.filter(resource_groups__in=all_groups).distinct().values_list("pk", flat=True)) + + # Set the parent of each affected group (most recent by date at the top of each group's hierarchy) + AbstractResource.objects.filter(pk__in=all_resources).update(group_parent=False) + for group in ResourceGroup.objects.filter(pk__in=all_groups): + group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True) + + # Compute the related resources, citations, categories, and subjects + # Except for related_resources, these lists are inclusive of the resource we are processing + if new_groups: + related_resources = AbstractResource.objects.filter(resource_groups__in=new_groups) + related_aggregates = related_resources.aggregate( + all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])), + all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])), + all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])), + ) + related_resources = related_resources.exclude(pk=resource.pk) # Exclude the current resource for related_resources + + # Set related_X fields for this resource + resource.related_resources.set(related_resources) + resource.related_citations.set(related_aggregates["all_citations"]) + resource.related_categories.set(related_aggregates["all_categories"]) + resource.related_subjects.set(related_aggregates["all_subjects"]) + else: + # Set related_X to contain only the X objects in the individual resource. + # We must do this for filtering purposes when `group_resources=true` on resource endpoints. + resource.related_resources.clear() + resource.related_citations.set(resource.cfr_citations.all()) + resource.related_categories.set([resource.category] if resource.category else []) + resource.related_subjects.set(resource.subjects.all()) + + +class Migration(migrations.Migration): + + dependencies = [ + ('resources', '0005_reset_extract_url'), + ] + + operations = [ + migrations.RunPython(generate_related, reverse_code=migrations.RunPython.noop), + ] diff --git a/solution/backend/resources/models/groups.py b/solution/backend/resources/models/groups.py index 2fe840dd7..1b20cfa37 100644 --- a/solution/backend/resources/models/groups.py +++ b/solution/backend/resources/models/groups.py @@ -52,50 +52,58 @@ class Meta: def update_related_resources(resource, first=True): - groups = resource.resource_groups.all() + # Compute the difference between old groups and new groups and process all affected resources. + # This takes care of the case where the resource was previously in a group, but is now not in that group. + # We must ensure related fields and parents are updated for all related_resources that currently exist. + # + # Note that all_groups and all_resources are cast to lists of pks to avoid issues with the queryset being erased + # later in this function for reasons unknown. + new_groups = resource.resource_groups.all() + q = Q(pk__in=new_groups) | Q(resources__in=resource.related_resources.all()) + all_groups = list(ResourceGroup.objects.filter(q).distinct().values_list("pk", flat=True)) + all_resources = list(AbstractResource.objects.filter(resource_groups__in=all_groups).distinct().values_list("pk", flat=True)) - if not groups: - # This resource does not belong to any groups, so we will treat it differently. + if first: + # Set the parent of each affected group (most recent by date at the top of each group's hierarchy) + AbstractResource.objects.filter(pk__in=all_resources).update(group_parent=False) + for group in ResourceGroup.objects.filter(pk__in=all_groups): + group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True) + + # Compute the related resources, citations, categories, and subjects + # Except for related_resources, these lists are inclusive of the resource we are processing + if new_groups: + related_resources = AbstractResource.objects.filter(resource_groups__in=new_groups) + related_aggregates = related_resources.aggregate( + all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])), + all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])), + all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])), + ) + related_resources = related_resources.exclude(pk=resource.pk) # Exclude the current resource for related_resources + + # Set related_X fields for this resource + resource.related_resources.set(related_resources) + resource.related_citations.set(related_aggregates["all_citations"]) + resource.related_categories.set(related_aggregates["all_categories"]) + resource.related_subjects.set(related_aggregates["all_subjects"]) + else: # Set related_X to contain only the X objects in the individual resource. # We must do this for filtering purposes when `group_resources=true` on resource endpoints. resource.related_resources.clear() resource.related_citations.set(resource.cfr_citations.all()) resource.related_categories.set([resource.category] if resource.category else []) resource.related_subjects.set(resource.subjects.all()) - return - - if first: - # Set the parent of each group (most recent by date at the top of each group's hierarchy) - AbstractResource.objects.filter(resource_groups__in=groups).update(group_parent=False) - for group in groups: - group.resources.filter(pk=group.resources.order_by("-date").first().pk).update(group_parent=True) - - # Compute the related resources, citations, categories, and subjects - # Except for related_resources, these lists are inclusive of the resource we are processing - related_resources = AbstractResource.objects.filter(resource_groups__in=groups) - related_aggregates = related_resources.aggregate( - all_citations=ArrayAgg("cfr_citations", distinct=True, filter=Q(cfr_citations__isnull=False), default=Value([])), - all_categories=ArrayAgg("category", distinct=True, filter=Q(category__isnull=False), default=Value([])), - all_subjects=ArrayAgg("subjects", distinct=True, filter=Q(subjects__isnull=False), default=Value([])), - ) - related_resources = related_resources.exclude(pk=resource.pk) # Exclude the current resource for related_resources - - # Set related_X for this resource - resource.related_resources.set(related_resources) - resource.related_citations.set(related_aggregates["all_citations"]) - resource.related_categories.set(related_aggregates["all_categories"]) - resource.related_subjects.set(related_aggregates["all_subjects"]) if first: - # For the first resource processed, also update all of its related_resources - for related_resource in related_resources: - update_related_resources(related_resource, False) + # For the first resource processed, also update all affected resources (new and old) + for affected in AbstractResource.objects.filter(pk__in=all_resources): + update_related_resources(affected, False) @receiver(post_save, sender=ResourceGroup) def update_resource_group(sender, instance, **kwargs): - for resource in instance.resources.all(): - update_related_resources(resource) + resources = instance.resources.all() + if resources: + update_related_resources(resources.first()) @receiver(post_save, sender=AbstractResource) diff --git a/solution/backend/resources/tests/test_resource_grouping.py b/solution/backend/resources/tests/test_resource_grouping.py index fe224b62a..44f66d3c5 100644 --- a/solution/backend/resources/tests/test_resource_grouping.py +++ b/solution/backend/resources/tests/test_resource_grouping.py @@ -45,9 +45,9 @@ def setUp(self): # cause duplicate key errors if the following line is not run. PublicCategory.objects.all().delete() groups = [ResourceGroup.objects.create(id=i) for i in range(3)] - categories = [PublicCategory.objects.create(id=i, name=f"{i}") for i in range(6)] - subjects = [Subject.objects.create(id=i) for i in range(6)] - citations = [Section.objects.create(id=i, title=i, part=i, section_id=i) for i in range(7)] + categories = [PublicCategory.objects.create(id=i, name=f"{i}") for i in range(7)] + subjects = [Subject.objects.create(id=i) for i in range(7)] + citations = [Section.objects.create(id=i, title=i, part=i, section_id=i) for i in range(8)] links = [FederalRegisterLink.objects.create( id=i, date=(datetime.now() - timedelta(i)).strftime('%Y-%m-%d'), @@ -71,9 +71,9 @@ def setUp(self): i.save() # Create a link with no group - link = FederalRegisterLink.objects.create(id=6, category=categories[0]) - link.cfr_citations.set([citations[0], citations[1]]) - link.subjects.set([subjects[0], subjects[1]]) + link = FederalRegisterLink.objects.create(id=6, category=categories[6]) + link.cfr_citations.set([citations[6], citations[7]]) + link.subjects.set([subjects[6]]) link.save() # Ensure only items 0 and 1 are computed to be group parents @@ -155,52 +155,240 @@ def test_no_group_related_fields(self): link = query.first() self.assertEqual(len(link.related_resources_set), 0) - self.assertCountEqual(link.related_categories_set, [0]) - self.assertCountEqual(link.related_subjects_set, [0, 1]) - self.assertCountEqual(link.related_citations_set, [0, 1]) + self.assertCountEqual(link.related_categories_set, [6]) + self.assertCountEqual(link.related_subjects_set, [6]) + self.assertCountEqual(link.related_citations_set, [6, 7]) - # TODO: this test should pass. - # - # Right now, removing a parent of a group from the group does not properly recompute the group parents. - # Also it might be expected that related_X fields are not properly computed as well in this case. + def test_update_group(self): + group = ResourceGroup.objects.get(id=0) + group.resources.set([2, 4]) + group.save() + + links = FederalRegisterLink.objects.filter(resource_groups__isnull=False).order_by("id").annotate( + related_resources_set=distinct_array_agg("related_resources"), + related_categories_set=distinct_array_agg("related_categories"), + related_subjects_set=distinct_array_agg("related_subjects"), + related_citations_set=distinct_array_agg("related_citations"), + ).distinct() + + [self.assertEqual(links[i].group_parent, True) for i in [0, 2, 1]] + [self.assertEqual(links[i].group_parent, False) for i in [3, 4, 5]] + + expected = [ + [3, 5], + [3], + [4], + [0, 5, 1], + [2], + [0, 3], + ] + + for i in range(len(links)): + self.assertCountEqual( + links[i].related_resources_set, + expected[i], + msg=f"related_resources for resource {i} is incorrect!", + ) + self.assertCountEqual( + links[i].related_categories_set, + expected[i] + [i], + msg=f"related_categories for resource {i} is incorrect!", + ) + self.assertCountEqual( + links[i].related_subjects_set, + expected[i] + [i], + msg=f"related_subjects for resource {i} is incorrect!", + ) + + # Test that a resource's related_X fields are cleared when removed from a group # - # This should only affect multigroup scenarios of which we currently have none on production, so - # fixing this can be a followup ticket. + # Test config looks like: + # { + # "to_update": items_pk, + # "new_groups": [new_group_pks], + # "updated_related_fields": { + # "related_resources": [expected_related_resource_pks], + # "related_categories": [expected_related_category_pks], + # "related_subjects": [expected_related_subject_pks], + # "related_citations": [expected_related_citation_pks], + # }, + # "parents_true": [expected_pks_where_parent_is_true], + # "parents_false": [expected_pks_where_parent_is_false], + # "grouped_related_fields": [ + # [expected_related_resource_pks_item_0], + # ..... + # [expected_related_resource_pks_item_5], + # ], + # } # - # Expected fix: rewrite the post-save hook to correctly compute parents and related_X fields for resources - # that are part of a group _OR_ have related_resources set. We currently only process the former. - - # # Test that a resource's related_X fields are cleared when removed from a group - # def test_group_removal(self): - # # Remove the first link's group assignments - # first = FederalRegisterLink.objects.order_by("id").first() - # first.resource_groups.clear() - # first.save() - - # # Get the first link again - # first = FederalRegisterLink.objects.order_by("id").annotate( - # related_resources_set=distinct_array_agg("related_resources"), - # related_categories_set=distinct_array_agg("related_categories"), - # related_subjects_set=distinct_array_agg("related_subjects"), - # related_citations_set=distinct_array_agg("related_citations"), - # ).distinct().first() - - # # Verify that its related_X fields are reset - # self.assertEqual(len(first.related_resources_set), 0) - # self.assertCountEqual(first.related_categories_set, [0]) - # self.assertCountEqual(first.related_subjects_set, [0]) - # self.assertCountEqual(first.related_citations_set, [0, 1]) - - # # Get the other (still grouped) links - # links = FederalRegisterLink.objects.filter(resource_groups__isnull=False).order_by("id").annotate( - # related_resources_set=distinct_array_agg("related_resources"), - # related_categories_set=distinct_array_agg("related_categories"), - # related_subjects_set=distinct_array_agg("related_subjects"), - # related_citations_set=distinct_array_agg("related_citations"), - # ).distinct() - - # # Verify group_parent field is set correctly - # for i in [1, 2, 3]: - # self.assertEqual(links[i].group_parent, True, msg=f"resource {i} should be a group parent now, but isn't!") - # for i in [4, 5]: - # self.assertEqual(links[i].group_parent, False, msg=f"resource {i} should not be a group parent now, but is!") + # See examples below. + def perform_group_reset_test(self, config): + # Update the configured link's group assignments + to_update = FederalRegisterLink.objects.get(id=config["to_update"]) + to_update.resource_groups.set(config["new_groups"]) + to_update.save() + + # Get the configured link again + updated = FederalRegisterLink.objects.annotate( + related_resources_set=distinct_array_agg("related_resources"), + related_categories_set=distinct_array_agg("related_categories"), + related_subjects_set=distinct_array_agg("related_subjects"), + related_citations_set=distinct_array_agg("related_citations"), + ).distinct().get(id=config["to_update"]) + + # Verify that its related_X fields are reset + updated_related_fields = config["updated_related_fields"] + self.assertCountEqual(updated.related_resources_set, updated_related_fields["related_resources"]) + self.assertCountEqual(updated.related_categories_set, updated_related_fields["related_categories"]) + self.assertCountEqual(updated.related_subjects_set, updated_related_fields["related_subjects"]) + self.assertCountEqual(updated.related_citations_set, updated_related_fields["related_citations"]) + + # Get the other (still grouped) links + links = FederalRegisterLink.objects.filter(resource_groups__isnull=False).order_by("id").annotate( + related_resources_set=distinct_array_agg("related_resources"), + related_categories_set=distinct_array_agg("related_categories"), + related_subjects_set=distinct_array_agg("related_subjects"), + related_citations_set=distinct_array_agg("related_citations"), + ).distinct() + + # To simplify the following code, get link by pk instead of array index + def get_link(i): return [link for link in links if link.id == i][0] + + # Verify group_parent field is set correctly + for i in config["parents_true"]: + self.assertEqual(get_link(i).group_parent, True, msg=f"resource {i} should be a group parent now, but isn't!") + for i in config["parents_false"]: + self.assertEqual(get_link(i).group_parent, False, msg=f"resource {i} should not be a group parent now, but is!") + + # Verify related fields are set correctly + expected = config["grouped_related_fields"] + for i in [link.pk for link in links]: + self.assertCountEqual( + get_link(i).related_resources_set, + expected[i], + msg=f"related_resources for resource {i} is incorrect!", + ) + self.assertCountEqual( + get_link(i).related_categories_set, + expected[i] + [i], + msg=f"related_categories for resource {i} is incorrect!", + ) + self.assertCountEqual( + get_link(i).related_subjects_set, + expected[i] + [i], + msg=f"related_subjects for resource {i} is incorrect!", + ) + + def test_group_removal_item_0(self): + config = { + "to_update": 0, + "new_groups": [], + "updated_related_fields": { + "related_resources": [], + "related_categories": [0], + "related_subjects": [0], + "related_citations": [0, 1], + }, + "parents_true": [1, 2, 3], + "parents_false": [4, 5], + "grouped_related_fields": { + 1: [3], + 2: [4], + 3: [1, 5], + 4: [2], + 5: [3], + }, + } + self.perform_group_reset_test(config) + + def test_group_removal_item_1(self): + config = { + "to_update": 1, + "new_groups": [], + "updated_related_fields": { + "related_resources": [], + "related_categories": [1], + "related_subjects": [1], + "related_citations": [1, 2], + }, + "parents_true": [0, 3], + "parents_false": [2, 4, 5], + "grouped_related_fields": { + 0: [2, 4, 3, 5], + 2: [0, 4], + 3: [0, 5], + 4: [0, 2], + 5: [0, 3], + }, + } + self.perform_group_reset_test(config) + + def test_group_update_item_3(self): + config = { + "to_update": 3, + "new_groups": [0, 1], + "updated_related_fields": { + "related_resources": [0, 2, 4, 5], + "related_categories": [0, 2, 3, 4, 5], + "related_subjects": [0, 2, 3, 4, 5], + "related_citations": [0, 1, 2, 3, 4, 5, 6], + }, + "parents_true": [0, 1], + "parents_false": [2, 3, 4, 5], + "grouped_related_fields": { + 0: [2, 4, 3, 5], + 1: [], + 2: [0, 3, 4], + 3: [0, 2, 4, 5], + 4: [0, 2, 3], + 5: [0, 3], + }, + } + self.perform_group_reset_test(config) + + def test_group_add_item_5(self): + config = { + "to_update": 5, + "new_groups": [1, 2], + "updated_related_fields": { + "related_resources": [0, 3, 1], + "related_categories": [0, 3, 1, 5], + "related_subjects": [0, 3, 1, 5], + "related_citations": [0, 1, 2, 3, 4, 5, 6], + }, + "parents_true": [0, 1], + "parents_false": [2, 3, 4, 5], + "grouped_related_fields": { + 0: [2, 3, 4, 5], + 1: [3, 5], + 2: [0, 4], + 3: [0, 1, 5], + 4: [0, 2], + 5: [0, 3, 1], + }, + } + self.perform_group_reset_test(config) + + def test_group_add_item_6(self): + config = { + "to_update": 6, + "new_groups": [1, 2], + "updated_related_fields": { + "related_resources": [0, 3, 1, 5], + "related_categories": [0, 3, 1, 5, 6], + "related_subjects": [0, 3, 1, 5, 6], + "related_citations": [0, 1, 2, 3, 4, 5, 6, 7], + }, + "parents_true": [0, 1], + "parents_false": [2, 3, 4, 5], + "grouped_related_fields": { + 0: [2, 3, 4, 5, 6], + 1: [3, 6], + 2: [0, 4], + 3: [0, 1, 5, 6], + 4: [0, 2], + 5: [0, 3, 6], + 6: [0, 1, 3, 5], + }, + } + self.perform_group_reset_test(config)