From ae0aa15b83bc9a66eb8a09b311f205cd7b58c777 Mon Sep 17 00:00:00 2001
From: Ben Bonfil <b.bonfil@uu.nl>
Date: Thu, 17 Aug 2023 10:17:05 +0200
Subject: [PATCH] Rerun search queries when cache files are missing (#90)

* trying to fix cache issue

* added test

* fixed type warning

* filtering of result objects was incorrect in the previous commit

* more robust check_results()

* avoid ambiguity around FileNotFoundError by creating cache file in post_save
---
 backend/search/models.py | 62 +++++++++++++++++++++++++++++-----------
 backend/search/tests.py  | 47 ++++++++++++++++++++++++++++++
 backend/search/views.py  |  4 ---
 web-ui/angular.json      |  3 +-
 4 files changed, 95 insertions(+), 21 deletions(-)
diff --git a/backend/search/models.py b/backend/search/models.py
index d4f369cf..f231c81c 100644
--- a/backend/search/models.py
+++ b/backend/search/models.py
@@ -2,11 +2,12 @@
 from django.utils import timezone
 from django.db.models import F, Sum
 from django.conf import settings
-from django.db.models.signals import pre_delete
+from django.db.models.signals import pre_delete, post_save
 from django.dispatch import receiver
 
 from copy import deepcopy
 import logging
+import os
 import pathlib
 import re
 from datetime import timedelta
@@ -64,15 +65,20 @@ def _get_cache_path(self, create_dir: bool) -> pathlib.Path:
                 raise SearchError('Could not create caching directory')
         return settings.CACHING_DIR / str(self.id)
 
+    def check_results(self) -> bool:
+        try:
+            self.get_results()
+            return True
+        except Exception:
+            logger.exception('Failed reading results of ComponentSearchQuery: %d', self.pk)
+
+        return False
+
     def get_results(self) -> ResultSet:
         """Return results as a dict"""
         cache_filename = str(self._get_cache_path(False))
-        try:
-            cache_file = open(cache_filename, 'r')
-        except FileNotFoundError:
-            # This can happen if search results are requested before
-            # searching had started, and is not necessarily an error
-            return []
+        cache_file = open(cache_filename, 'r')
+
         results = cache_file.read()
         cache_file.close()
         self.last_accessed = timezone.now()
@@ -83,6 +89,11 @@ def get_results(self) -> ResultSet:
         self.save(update_fields=['last_accessed'])
         return parse_search_result(results, self.component.slug)
 
+    def get_completed_part(self) -> Optional[int]:
+        if self.check_results():
+            return self.completed_part
+        return None
+
     def _truncate_results(self, results: str, number: int) -> str:
         matches = list(re.finditer('<match>', results))
         start_of_nth_match = matches[number].span()[0]
@@ -180,6 +191,9 @@ def perform_search(self, query_id=None):
         self.last_accessed = timezone.now()
         self.save()
 
+    def init_cache_file(self):
+        self._get_cache_path(False).touch()
+
     def delete_cache_file(self):
         """Delete the cache file belonging to this ComponentSearchResult.
         This method is called automatically on delete."""
@@ -240,6 +254,10 @@ def purge_cache(cls):
                            'maximum size.'.format(number_deleted))
 
 
+@receiver(post_save, sender=ComponentSearchResult)
+def component_search_result_create_callback(sender, instance, using, **kwargs):
+    instance.init_cache_file()
+
 @receiver(pre_delete, sender=ComponentSearchResult)
 def delete_basex_db_callback(sender, instance, using, **kwargs):
     instance.delete_cache_file()
@@ -339,10 +357,10 @@ def get_results(self, max_results: Optional[int] = None, exclude: Optional[Set[s
 
         for result_obj in self._component_results():
             # Count completed part (for all results)
-            if result_obj.completed_part is not None:
-                completed_part += result_obj.completed_part
-                percentage = result_obj.completed_part / \
-                    max(1, result_obj.component.total_database_size * 100)
+            part = result_obj.get_completed_part()
+            if part is not None:
+                completed_part += part
+                percentage = part / max(1, result_obj.component.total_database_size * 100)
                 counts.append({
                     'component': result_obj.component.slug,
                     'number_of_results': self._count_results(result_obj),
@@ -372,27 +390,39 @@ def perform_search(self) -> None:
         # completed yet, and starting with those that have not started yet
         # (because those for which search has already started may finish
         # early).
-        result_objs = self.results \
-            .filter(search_completed__isnull=True)
+        result_objs = self.results.filter(search_completed__isnull=True)
         # add failed result objects (with errors and no results)
         result_objs |= self.results.filter(number_of_results=0).exclude(errors=None).exclude(errors='')
 
         result_objs = result_objs.order_by(F('completed_part').desc(nulls_first=True),
                                            'component__slug')
 
+        result_objs = list(result_objs)
+        # append results that should be complete but can't be read
+        result_objs += [r for r in self.results.filter(search_completed__isnull=False) if not r.check_results()]
+
+        # loop through the linked ComponentSearchResults.
+        # for each component, we have to either run the query (perform_search)
+        # or read the results that were already collected (get_results)
         for result_obj in result_objs:
-            # Check if search has been completed by now by a concurrent
-            # search. If so, continue
             result_obj.refresh_from_db()
+            # if search has been completed, we expect to be able to read the results
             if result_obj.search_completed and not result_obj.errors:
-                continue
+                # kinda roundabout way to make sure the results are readable before skipping it
+                # make sure the results are accessible, because reading the cache might fail
+                if result_obj.check_results():
+                    # results are readable, skip the rest of the loop
+                    continue
             try:
                 result_obj.perform_search(self.id)
             except SearchError:
+                logger.error('Failed executing query for ComponentSearchResult (%d)', result_obj.pk)
                 raise
+
             # Check if search has been cancelled in the meantime
             self.refresh_from_db(fields=['cancelled'])
             if self.cancelled:
+                # skip the rest of the components
                 break
 
     def get_errors(self) -> str:
diff --git a/backend/search/tests.py b/backend/search/tests.py
index 7b98c260..1b25c926 100644
--- a/backend/search/tests.py
+++ b/backend/search/tests.py
@@ -6,6 +6,8 @@
 import lxml.etree as etree
 import tempfile
 import pathlib
+import os
+import shutil
 
 from treebanks.models import Treebank
 from services.basex import basex
@@ -310,3 +312,48 @@ def test_perform_count(self):
         counts = sq.perform_count()
         self.assertEqual(counts['troonrede19'], 4)
         self.assertEqual(counts['troonrede20'], 3)
+
+
+    def test_missing_cache(self):
+        with self.settings(CACHING_DIR=test_cache_path):
+            ComponentSearchResult.objects.all().delete()
+            # SQ with full treebank
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+            sq.perform_search()
+            results = list(sq.get_results()[0])
+            self.assertGreater(len(results), 0)
+
+            # now run the same query, so that cache is used
+            # but first remove the cache files
+            shutil.rmtree(test_cache_path)
+
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+            sq.perform_search()
+            results = list(sq.get_results()[0])
+            self.assertGreater(len(results), 0)
+
+    def test_read_before_search(self):
+        with self.settings(CACHING_DIR=test_cache_path):
+            # Make sure there are no results left from other tests
+            ComponentSearchResult.objects.all().delete()
+            for f in test_cache_path.glob('*'):
+                os.unlink(f)
+
+            # SQ with full treebank
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+
+            results = list(sq.get_results()[0])
+            # the test here is that nothing throws
+            self.assertEqual(len(results), 0)
diff --git a/backend/search/views.py b/backend/search/views.py
index bd4e66fc..2ef29cca 100644
--- a/backend/search/views.py
+++ b/backend/search/views.py
@@ -30,10 +30,6 @@
 log = logging.getLogger(__name__)
 
 
-def run_search(query_obj) -> None:
-    query_obj.perform_search()
-
-
 def _create_component_on_the_fly(component_slug: str, _treebank: str) -> None:
     '''Try to create a component object consisting of one database
     with the same name. Also create Treebank object if it does not yet
diff --git a/web-ui/angular.json b/web-ui/angular.json
index d0afadfd..7cac46d9 100644
--- a/web-ui/angular.json
+++ b/web-ui/angular.json
@@ -2,7 +2,8 @@
     "$schema": "./node_modules/@angular/cli/lib/config/schema.json",
     "version": 1,
     "cli": {
-        "packageManager": "yarn"
+        "packageManager": "yarn",
+        "analytics": false
     },
     "newProjectRoot": "projects",
     "projects": {