From ae0aa15b83bc9a66eb8a09b311f205cd7b58c777 Mon Sep 17 00:00:00 2001 From: Ben Bonfil Date: Thu, 17 Aug 2023 10:17:05 +0200 Subject: [PATCH] Rerun search queries when cache files are missing (#90) * trying to fix cache issue * added test * fixed type warning * filtering of result objects was incorrect in the previous commit * more robust check_results() * avoid ambiguity around FileNotFoundError by creating cache file in post_save --- backend/search/models.py | 62 +++++++++++++++++++++++++++++----------- backend/search/tests.py | 47 ++++++++++++++++++++++++++++++ backend/search/views.py | 4 --- web-ui/angular.json | 3 +- 4 files changed, 95 insertions(+), 21 deletions(-) diff --git a/backend/search/models.py b/backend/search/models.py index d4f369cf..f231c81c 100644 --- a/backend/search/models.py +++ b/backend/search/models.py @@ -2,11 +2,12 @@ from django.utils import timezone from django.db.models import F, Sum from django.conf import settings -from django.db.models.signals import pre_delete +from django.db.models.signals import pre_delete, post_save from django.dispatch import receiver from copy import deepcopy import logging +import os import pathlib import re from datetime import timedelta @@ -64,15 +65,20 @@ def _get_cache_path(self, create_dir: bool) -> pathlib.Path: raise SearchError('Could not create caching directory') return settings.CACHING_DIR / str(self.id) + def check_results(self) -> bool: + try: + self.get_results() + return True + except Exception: + logger.exception('Failed reading results of ComponentSearchQuery: %d', self.pk) + + return False + def get_results(self) -> ResultSet: """Return results as a dict""" cache_filename = str(self._get_cache_path(False)) - try: - cache_file = open(cache_filename, 'r') - except FileNotFoundError: - # This can happen if search results are requested before - # searching had started, and is not necessarily an error - return [] + cache_file = open(cache_filename, 'r') + results = cache_file.read() cache_file.close() self.last_accessed = timezone.now() @@ -83,6 +89,11 @@ def get_results(self) -> ResultSet: self.save(update_fields=['last_accessed']) return parse_search_result(results, self.component.slug) + def get_completed_part(self) -> Optional[int]: + if self.check_results(): + return self.completed_part + return None + def _truncate_results(self, results: str, number: int) -> str: matches = list(re.finditer('', results)) start_of_nth_match = matches[number].span()[0] @@ -180,6 +191,9 @@ def perform_search(self, query_id=None): self.last_accessed = timezone.now() self.save() + def init_cache_file(self): + self._get_cache_path(False).touch() + def delete_cache_file(self): """Delete the cache file belonging to this ComponentSearchResult. This method is called automatically on delete.""" @@ -240,6 +254,10 @@ def purge_cache(cls): 'maximum size.'.format(number_deleted)) +@receiver(post_save, sender=ComponentSearchResult) +def component_search_result_create_callback(sender, instance, using, **kwargs): + instance.init_cache_file() + @receiver(pre_delete, sender=ComponentSearchResult) def delete_basex_db_callback(sender, instance, using, **kwargs): instance.delete_cache_file() @@ -339,10 +357,10 @@ def get_results(self, max_results: Optional[int] = None, exclude: Optional[Set[s for result_obj in self._component_results(): # Count completed part (for all results) - if result_obj.completed_part is not None: - completed_part += result_obj.completed_part - percentage = result_obj.completed_part / \ - max(1, result_obj.component.total_database_size * 100) + part = result_obj.get_completed_part() + if part is not None: + completed_part += part + percentage = part / max(1, result_obj.component.total_database_size * 100) counts.append({ 'component': result_obj.component.slug, 'number_of_results': self._count_results(result_obj), @@ -372,27 +390,39 @@ def perform_search(self) -> None: # completed yet, and starting with those that have not started yet # (because those for which search has already started may finish # early). - result_objs = self.results \ - .filter(search_completed__isnull=True) + result_objs = self.results.filter(search_completed__isnull=True) # add failed result objects (with errors and no results) result_objs |= self.results.filter(number_of_results=0).exclude(errors=None).exclude(errors='') result_objs = result_objs.order_by(F('completed_part').desc(nulls_first=True), 'component__slug') + result_objs = list(result_objs) + # append results that should be complete but can't be read + result_objs += [r for r in self.results.filter(search_completed__isnull=False) if not r.check_results()] + + # loop through the linked ComponentSearchResults. + # for each component, we have to either run the query (perform_search) + # or read the results that were already collected (get_results) for result_obj in result_objs: - # Check if search has been completed by now by a concurrent - # search. If so, continue result_obj.refresh_from_db() + # if search has been completed, we expect to be able to read the results if result_obj.search_completed and not result_obj.errors: - continue + # kinda roundabout way to make sure the results are readable before skipping it + # make sure the results are accessible, because reading the cache might fail + if result_obj.check_results(): + # results are readable, skip the rest of the loop + continue try: result_obj.perform_search(self.id) except SearchError: + logger.error('Failed executing query for ComponentSearchResult (%d)', result_obj.pk) raise + # Check if search has been cancelled in the meantime self.refresh_from_db(fields=['cancelled']) if self.cancelled: + # skip the rest of the components break def get_errors(self) -> str: diff --git a/backend/search/tests.py b/backend/search/tests.py index 7b98c260..1b25c926 100644 --- a/backend/search/tests.py +++ b/backend/search/tests.py @@ -6,6 +6,8 @@ import lxml.etree as etree import tempfile import pathlib +import os +import shutil from treebanks.models import Treebank from services.basex import basex @@ -310,3 +312,48 @@ def test_perform_count(self): counts = sq.perform_count() self.assertEqual(counts['troonrede19'], 4) self.assertEqual(counts['troonrede20'], 3) + + + def test_missing_cache(self): + with self.settings(CACHING_DIR=test_cache_path): + ComponentSearchResult.objects.all().delete() + # SQ with full treebank + sq = SearchQuery(xpath=XPATH1) + sq.save() + components = test_treebank.components.all() + sq.components.add(*components) + sq.initialize() + sq.perform_search() + results = list(sq.get_results()[0]) + self.assertGreater(len(results), 0) + + # now run the same query, so that cache is used + # but first remove the cache files + shutil.rmtree(test_cache_path) + + sq = SearchQuery(xpath=XPATH1) + sq.save() + components = test_treebank.components.all() + sq.components.add(*components) + sq.initialize() + sq.perform_search() + results = list(sq.get_results()[0]) + self.assertGreater(len(results), 0) + + def test_read_before_search(self): + with self.settings(CACHING_DIR=test_cache_path): + # Make sure there are no results left from other tests + ComponentSearchResult.objects.all().delete() + for f in test_cache_path.glob('*'): + os.unlink(f) + + # SQ with full treebank + sq = SearchQuery(xpath=XPATH1) + sq.save() + components = test_treebank.components.all() + sq.components.add(*components) + sq.initialize() + + results = list(sq.get_results()[0]) + # the test here is that nothing throws + self.assertEqual(len(results), 0) diff --git a/backend/search/views.py b/backend/search/views.py index bd4e66fc..2ef29cca 100644 --- a/backend/search/views.py +++ b/backend/search/views.py @@ -30,10 +30,6 @@ log = logging.getLogger(__name__) -def run_search(query_obj) -> None: - query_obj.perform_search() - - def _create_component_on_the_fly(component_slug: str, _treebank: str) -> None: '''Try to create a component object consisting of one database with the same name. Also create Treebank object if it does not yet diff --git a/web-ui/angular.json b/web-ui/angular.json index d0afadfd..7cac46d9 100644 --- a/web-ui/angular.json +++ b/web-ui/angular.json @@ -2,7 +2,8 @@ "$schema": "./node_modules/@angular/cli/lib/config/schema.json", "version": 1, "cli": { - "packageManager": "yarn" + "packageManager": "yarn", + "analytics": false }, "newProjectRoot": "projects", "projects": {