Rerun search queries when cache files are missing (#90)
* trying to fix cache issue

* added test

* fixed type warning

* filtering of result objects was incorrect in the previous commit

* more robust check_results()

* avoid ambiguity around FileNotFoundError by creating cache file in post_save
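
The last bullet is the heart of the fix: rather than guessing inside get_results whether a missing cache file means "search not started yet" or "cache purged", the cache file is now created the moment a ComponentSearchResult row is saved, so a missing file can only mean the cache was deleted. A condensed sketch of that Django signal pattern (names follow the models.py diff below; assumes the ComponentSearchResult model is in scope):

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    @receiver(post_save, sender=ComponentSearchResult)
    def component_search_result_create_callback(sender, instance, using, **kwargs):
        # touch() is idempotent: it creates an empty cache file for a new
        # row but leaves an existing file (with results) untouched.
        instance.init_cache_file()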
bbonf authored Aug 17, 2023
1 parent 70febbd commit ae0aa15
Showing 4 changed files with 95 additions and 21 deletions.
62 changes: 46 additions & 16 deletions backend/search/models.py
@@ -2,11 +2,12 @@
 from django.utils import timezone
 from django.db.models import F, Sum
 from django.conf import settings
-from django.db.models.signals import pre_delete
+from django.db.models.signals import pre_delete, post_save
 from django.dispatch import receiver
 
 from copy import deepcopy
 import logging
+import os
 import pathlib
 import re
 from datetime import timedelta
@@ -64,15 +65,20 @@ def _get_cache_path(self, create_dir: bool) -> pathlib.Path:
             raise SearchError('Could not create caching directory')
         return settings.CACHING_DIR / str(self.id)
 
+    def check_results(self) -> bool:
+        try:
+            self.get_results()
+            return True
+        except Exception:
+            logger.exception('Failed reading results of ComponentSearchQuery: %d', self.pk)
+
+        return False
+
     def get_results(self) -> ResultSet:
         """Return results as a dict"""
         cache_filename = str(self._get_cache_path(False))
-        try:
-            cache_file = open(cache_filename, 'r')
-        except FileNotFoundError:
-            # This can happen if search results are requested before
-            # searching had started, and is not necessarily an error
-            return []
+        cache_file = open(cache_filename, 'r')
 
         results = cache_file.read()
         cache_file.close()
         self.last_accessed = timezone.now()
@@ -83,6 +89,11 @@ def get_results(self) -> ResultSet:
         self.save(update_fields=['last_accessed'])
         return parse_search_result(results, self.component.slug)
 
+    def get_completed_part(self) -> Optional[int]:
+        if self.check_results():
+            return self.completed_part
+        return None
+
     def _truncate_results(self, results: str, number: int) -> str:
         matches = list(re.finditer('<match>', results))
         start_of_nth_match = matches[number].span()[0]
@@ -180,6 +191,9 @@ def perform_search(self, query_id=None):
         self.last_accessed = timezone.now()
         self.save()
 
+    def init_cache_file(self):
+        self._get_cache_path(False).touch()
+
     def delete_cache_file(self):
         """Delete the cache file belonging to this ComponentSearchResult.
         This method is called automatically on delete."""
@@ -240,6 +254,10 @@ def purge_cache(cls):
                         'maximum size.'.format(number_deleted))
 
 
+@receiver(post_save, sender=ComponentSearchResult)
+def component_search_result_create_callback(sender, instance, using, **kwargs):
+    instance.init_cache_file()
+
 @receiver(pre_delete, sender=ComponentSearchResult)
 def delete_basex_db_callback(sender, instance, using, **kwargs):
     instance.delete_cache_file()
@@ -339,10 +357,10 @@ def get_results(self, max_results: Optional[int] = None, exclude: Optional[Set[s

         for result_obj in self._component_results():
             # Count completed part (for all results)
-            if result_obj.completed_part is not None:
-                completed_part += result_obj.completed_part
-                percentage = result_obj.completed_part / \
-                    max(1, result_obj.component.total_database_size * 100)
+            part = result_obj.get_completed_part()
+            if part is not None:
+                completed_part += part
+                percentage = part / max(1, result_obj.component.total_database_size * 100)
             counts.append({
                 'component': result_obj.component.slug,
                 'number_of_results': self._count_results(result_obj),
@@ -372,27 +390,39 @@ def perform_search(self) -> None:
         # completed yet, and starting with those that have not started yet
         # (because those for which search has already started may finish
         # early).
-        result_objs = self.results \
-            .filter(search_completed__isnull=True)
+        result_objs = self.results.filter(search_completed__isnull=True)
+        # add failed result objects (with errors and no results)
+        result_objs |= self.results.filter(number_of_results=0).exclude(errors=None).exclude(errors='')
 
         result_objs = result_objs.order_by(F('completed_part').desc(nulls_first=True),
                                            'component__slug')
 
+        result_objs = list(result_objs)
+        # append results that should be complete but can't be read
+        result_objs += [r for r in self.results.filter(search_completed__isnull=False) if not r.check_results()]
 
+        # loop through the linked ComponentSearchResults.
+        # for each component, we have to either run the query (perform_search)
+        # or read the results that were already collected (get_results)
         for result_obj in result_objs:
             # Check if search has been completed by now by a concurrent
             # search. If so, continue
             result_obj.refresh_from_db()
+            # if search has been completed, we expect to be able to read the results
             if result_obj.search_completed and not result_obj.errors:
-                continue
+                # kinda roundabout way to make sure the results are readable before skipping it
+                # make sure the results are accessible, because reading the cache might fail
+                if result_obj.check_results():
+                    # results are readable, skip the rest of the loop
+                    continue
             try:
                 result_obj.perform_search(self.id)
             except SearchError:
                 logger.error('Failed executing query for ComponentSearchResult (%d)', result_obj.pk)
                 raise
 
             # Check if search has been cancelled in the meantime
             self.refresh_from_db(fields=['cancelled'])
             if self.cancelled:
                 # skip the rest of the components
                 break
 
     def get_errors(self) -> str:
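Taken together, the models.py changes enforce one rule: cached results only count as present if they can actually be read and parsed; anything else falls through to re-running the query. A standalone sketch of that rule, stripped of the Django specifics (read_cached and rerun are illustrative names, not from the codebase):

    import pathlib
    from typing import Callable

    def read_cached(cache_path: pathlib.Path, rerun: Callable[[], str]) -> str:
        """Return cached results, re-running the search if the cache is unusable."""
        try:
            return cache_path.read_text()  # stand-in for get_results()
        except Exception:
            # cache file purged or corrupt: search again, as
            # SearchQuery.perform_search now does via check_results()
            return rerun()
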
47 changes: 47 additions & 0 deletions backend/search/tests.py
@@ -6,6 +6,8 @@
 import lxml.etree as etree
 import tempfile
 import pathlib
+import os
+import shutil
 
 from treebanks.models import Treebank
 from services.basex import basex
@@ -310,3 +312,48 @@ def test_perform_count(self):
         counts = sq.perform_count()
         self.assertEqual(counts['troonrede19'], 4)
         self.assertEqual(counts['troonrede20'], 3)
+
+
+    def test_missing_cache(self):
+        with self.settings(CACHING_DIR=test_cache_path):
+            ComponentSearchResult.objects.all().delete()
+            # SQ with full treebank
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+            sq.perform_search()
+            results = list(sq.get_results()[0])
+            self.assertGreater(len(results), 0)
+
+            # now run the same query, so that cache is used
+            # but first remove the cache files
+            shutil.rmtree(test_cache_path)
+
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+            sq.perform_search()
+            results = list(sq.get_results()[0])
+            self.assertGreater(len(results), 0)
+
+    def test_read_before_search(self):
+        with self.settings(CACHING_DIR=test_cache_path):
+            # Make sure there are no results left from other tests
+            ComponentSearchResult.objects.all().delete()
+            for f in test_cache_path.glob('*'):
+                os.unlink(f)
+
+            # SQ with full treebank
+            sq = SearchQuery(xpath=XPATH1)
+            sq.save()
+            components = test_treebank.components.all()
+            sq.components.add(*components)
+            sq.initialize()
+
+            results = list(sq.get_results()[0])
+            # the test here is that nothing throws
+            self.assertEqual(len(results), 0)
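
test_read_before_search only passes because of the new post_save hook: after initialize(), every result row already has an empty cache file, so an early get_results() parses an empty string instead of raising FileNotFoundError. A tiny illustration of that file-state reasoning (the path below is hypothetical):

    import pathlib
    import tempfile

    cache_dir = pathlib.Path(tempfile.mkdtemp())  # stand-in for CACHING_DIR
    cache_file = cache_dir / '42'                 # one cache file per result id

    cache_file.touch()                   # what the post_save receiver now does
    assert cache_file.read_text() == ''  # early read: empty results, no exception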
4 changes: 0 additions & 4 deletions backend/search/views.py
@@ -30,10 +30,6 @@
 log = logging.getLogger(__name__)
 
 
-def run_search(query_obj) -> None:
-    query_obj.perform_search()
-
-
 def _create_component_on_the_fly(component_slug: str, _treebank: str) -> None:
     '''Try to create a component object consisting of one database
     with the same name. Also create Treebank object if it does not yet
3 changes: 2 additions & 1 deletion web-ui/angular.json
@@ -2,7 +2,8 @@
"$schema": "./node_modules/@angular/cli/lib/config/schema.json",
"version": 1,
"cli": {
"packageManager": "yarn"
"packageManager": "yarn",
"analytics": false
},
"newProjectRoot": "projects",
"projects": {
