Cache the parsed SKOS graph as a joblib dump for faster loading

Whenever subjects are written as SKOS/Turtle, the rdflib graph is also
written next to the Turtle file as a ".joblib.gz" dump. The skos property in
annif/vocab.py loads that dump when it exists; otherwise it parses
subjects.ttl and calls save_skos() so the dump is recreated for next time.

Review changes relative to the submitted patch:
 * The dump path is derived with os.path.splitext() instead of
   str.replace('.ttl', '.joblib.gz'). replace() rewrites every ".ttl"
   occurrence in the path (directory names included) and, for a target path
   that contains no ".ttl", returns the path unchanged, so the dump would
   overwrite the Turtle file that was just serialized.
 * Added a note in SubjectFileSKOS.__init__ that joblib.load() unpickles its
   input.

Indentation and blank lines were reconstructed from a whitespace-collapsed
copy of this patch, so context lines may differ in whitespace from the target
tree (git apply --ignore-space-change tolerates that). The "index" lines
still carry the blob ids of the submitted patch.

diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index ba4f04406..2f0d60adf 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -1,6 +1,8 @@
 """Support for subjects loaded from a SKOS/RDF file"""
 
+import os.path
 import shutil
+import joblib
 import rdflib
 import rdflib.util
 from rdflib.namespace import SKOS, RDF, OWL
@@ -22,6 +24,8 @@ def serialize_subjects_to_skos(subjects, language, path):
                    SKOS.notation,
                    rdflib.Literal(subject.notation)))
     graph.serialize(destination=path, format='turtle')
+    # also dump the graph in joblib format which is faster to load
+    joblib.dump(graph, os.path.splitext(path)[0] + '.joblib.gz')
 
 
 class SubjectFileSKOS(SubjectCorpus):
@@ -30,8 +34,13 @@ class SubjectFileSKOS(SubjectCorpus):
     def __init__(self, path, language):
         self.path = path
         self.language = language
-        self.graph = rdflib.Graph()
-        self.graph.load(self.path, format=rdflib.util.guess_format(self.path))
+        if path.endswith('.joblib.gz'):
+            # joblib.load unpickles: only load dumps written by Annif itself
+            self.graph = joblib.load(path)
+        else:
+            self.graph = rdflib.Graph()
+            self.graph.load(self.path,
+                            format=rdflib.util.guess_format(self.path))
 
     @property
     def subjects(self):
@@ -73,7 +82,11 @@ def save_skos(self, path, language):
 
         if self.path.endswith('.ttl'):
             # input is already in Turtle syntax, no need to reserialize
-            shutil.copyfile(self.path, path)
+            if not os.path.exists(path) or \
+               not os.path.samefile(self.path, path):
+                shutil.copyfile(self.path, path)
         else:
             # need to serialize into Turtle
             self.graph.serialize(destination=path, format='turtle')
+        # also dump the graph in joblib format which is faster to load
+        joblib.dump(self.graph, os.path.splitext(path)[0] + '.joblib.gz')
diff --git a/annif/vocab.py b/annif/vocab.py
index b60d3d382..ba368fb0e 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -59,15 +59,28 @@ def subjects(self):
     @property
     def skos(self):
         """return the subject vocabulary from SKOS file"""
-        if self._skos_vocab is None:
-            path = os.path.join(self.datadir, 'subjects.ttl')
-            if os.path.exists(path):
-                logger.debug(f'loading graph from {path}')
-                self._skos_vocab = annif.corpus.SubjectFileSKOS(path,
-                                                                self.language)
-            else:
-                raise NotInitializedException(f'graph file {path} not found')
-        return self._skos_vocab
+        if self._skos_vocab is not None:
+            return self._skos_vocab
+
+        # attempt to load graph from dump file
+        dumppath = os.path.join(self.datadir, 'subjects.joblib.gz')
+        if os.path.exists(dumppath):
+            logger.debug(f'loading graph dump from {dumppath}')
+            self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath,
+                                                            self.language)
+            return self._skos_vocab
+
+        # graph dump file not found - parse ttl file instead
+        path = os.path.join(self.datadir, 'subjects.ttl')
+        if os.path.exists(path):
+            logger.debug(f'loading graph from {path}')
+            self._skos_vocab = annif.corpus.SubjectFileSKOS(path,
+                                                            self.language)
+            # store the dump file so we can use it next time
+            self._skos_vocab.save_skos(path, self.language)
+            return self._skos_vocab
+
+        raise NotInitializedException(f'graph file {path} not found')
 
     def load_vocabulary(self, subject_corpus, language):
         """load subjects from a subject corpus and save them into a
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e21a7e104..56ab26f8f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -129,6 +129,8 @@ def test_loadvoc_tsv(testdatadir):
     assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists()
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists()
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0
 
 
 def test_loadvoc_tsv_with_bom(testdatadir):
@@ -148,6 +150,8 @@ def test_loadvoc_tsv_with_bom(testdatadir):
     assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists()
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists()
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0
 
 
 def test_loadvoc_rdf(testdatadir):
@@ -167,6 +171,8 @@ def test_loadvoc_rdf(testdatadir):
     assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists()
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists()
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0
 
 
 def test_loadvoc_ttl(testdatadir):
@@ -186,6 +192,8 @@ def test_loadvoc_ttl(testdatadir):
     assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists()
     assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists()
+    assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0
 
 
 def test_loadvoc_nonexistent_path():
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index c56de5d51..d098d9a0c 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -1,8 +1,10 @@
 """Unit tests for vocabulary functionality in Annif"""
 
+import pytest
 import os
 import annif.corpus
 import annif.vocab
+from annif.exception import NotInitializedException
 import rdflib.namespace
 
 
@@ -84,6 +86,36 @@ def test_update_subject_index_with_added_subjects(tmpdir):
                                  '42.42')
 
 
+def test_skos(tmpdir):
+    vocab = load_dummy_vocab(tmpdir)
+    assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists()
+    assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists()
+    assert isinstance(vocab.skos, annif.corpus.SubjectFileSKOS)
+
+
+def test_skos_cache(tmpdir):
+    vocab = load_dummy_vocab(tmpdir)
+    assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists()
+    assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists()
+    tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').remove()
+    assert not tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists()
+
+    assert isinstance(vocab.skos, annif.corpus.SubjectFileSKOS)
+    # cached dump file has been recreated in .skos property access
+    assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists()
+
+
+def test_skos_not_found(tmpdir):
+    vocab = load_dummy_vocab(tmpdir)
+    assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists()
+    assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists()
+    tmpdir.join('vocabs/vocab-id/subjects.ttl').remove()
+    tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').remove()
+
+    with pytest.raises(NotInitializedException):
+        vocab.skos
+
+
 def test_as_graph(tmpdir):
     vocab = load_dummy_vocab(tmpdir)
     graph = vocab.as_graph()