From 21ed17afd69f5dde326f67a1b82ca866e0e70877 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 13 Aug 2021 21:57:23 +0300 Subject: [PATCH 1/8] store skos graphs (also) as joblib dumps which can be loaded quickly --- annif/corpus/skos.py | 16 +++++++++++++--- annif/vocab.py | 7 +++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index ba4f04406..c80bc4a6c 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -1,6 +1,7 @@ """Support for subjects loaded from a SKOS/RDF file""" import shutil +import joblib import rdflib import rdflib.util from rdflib.namespace import SKOS, RDF, OWL @@ -22,6 +23,8 @@ def serialize_subjects_to_skos(subjects, language, path): SKOS.notation, rdflib.Literal(subject.notation))) graph.serialize(destination=path, format='turtle') + # also dump the graph in joblib format which is faster to load + joblib.dump(graph, path.replace('.ttl', '.joblib')) class SubjectFileSKOS(SubjectCorpus): @@ -30,8 +33,12 @@ class SubjectFileSKOS(SubjectCorpus): def __init__(self, path, language): self.path = path self.language = language - self.graph = rdflib.Graph() - self.graph.load(self.path, format=rdflib.util.guess_format(self.path)) + if path.endswith('.joblib'): + self.graph = joblib.load(path) + else: + self.graph = rdflib.Graph() + self.graph.load(self.path, + format=rdflib.util.guess_format(self.path)) @property def subjects(self): @@ -73,7 +80,10 @@ def save_skos(self, path, language): if self.path.endswith('.ttl'): # input is already in Turtle syntax, no need to reserialize - shutil.copyfile(self.path, path) + if self.path != path: + shutil.copyfile(self.path, path) else: # need to serialize into Turtle self.graph.serialize(destination=path, format='turtle') + # also dump the graph in joblib format which is faster to load + joblib.dump(self.graph, path.replace('.ttl', '.joblib')) diff --git a/annif/vocab.py b/annif/vocab.py index b60d3d382..f64371186 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -59,12 +59,19 @@ def subjects(self): @property def skos(self): """return the subject vocabulary from SKOS file""" + if self._skos_vocab is None: + dumppath = os.path.join(self.datadir, 'subjects.joblib') + if os.path.exists(dumppath): + logger.debug(f'loading graph dump from {dumppath}') + self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath, + self.language) if self._skos_vocab is None: path = os.path.join(self.datadir, 'subjects.ttl') if os.path.exists(path): logger.debug(f'loading graph from {path}') self._skos_vocab = annif.corpus.SubjectFileSKOS(path, self.language) + self._skos_vocab.save_skos(path, self.language) else: raise NotInitializedException(f'graph file {path} not found') return self._skos_vocab From 02c0f06949bea3acefea1622147e82f164ce05b9 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 15:45:59 +0300 Subject: [PATCH 2/8] save SKOS graph dumps gzipped to save disk space --- annif/corpus/skos.py | 6 +++--- annif/vocab.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index c80bc4a6c..aaceb4ecd 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -24,7 +24,7 @@ def serialize_subjects_to_skos(subjects, language, path): rdflib.Literal(subject.notation))) graph.serialize(destination=path, format='turtle') # also dump the graph in joblib format which is faster to load - joblib.dump(graph, path.replace('.ttl', '.joblib')) + joblib.dump(graph, path.replace('.ttl', '.joblib.gz')) class SubjectFileSKOS(SubjectCorpus): @@ -33,7 +33,7 @@ class SubjectFileSKOS(SubjectCorpus): def __init__(self, path, language): self.path = path self.language = language - if path.endswith('.joblib'): + if path.endswith('.joblib.gz'): self.graph = joblib.load(path) else: self.graph = rdflib.Graph() @@ -86,4 +86,4 @@ def save_skos(self, path, language): # need to serialize into Turtle self.graph.serialize(destination=path, format='turtle') # also dump the graph in joblib format which is faster to load - joblib.dump(self.graph, path.replace('.ttl', '.joblib')) + joblib.dump(self.graph, path.replace('.ttl', '.joblib.gz')) diff --git a/annif/vocab.py b/annif/vocab.py index f64371186..632e34e10 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -60,7 +60,7 @@ def subjects(self): def skos(self): """return the subject vocabulary from SKOS file""" if self._skos_vocab is None: - dumppath = os.path.join(self.datadir, 'subjects.joblib') + dumppath = os.path.join(self.datadir, 'subjects.joblib.gz') if os.path.exists(dumppath): logger.debug(f'loading graph dump from {dumppath}') self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath, From e39d98984695c0e12189c3826ba35ff3c8494856 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 15:54:48 +0300 Subject: [PATCH 3/8] better path comparison to avoid unnecessary copying --- annif/corpus/skos.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index aaceb4ecd..2f0d60adf 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -1,5 +1,6 @@ """Support for subjects loaded from a SKOS/RDF file""" +import os.path import shutil import joblib import rdflib @@ -80,7 +81,8 @@ def save_skos(self, path, language): if self.path.endswith('.ttl'): # input is already in Turtle syntax, no need to reserialize - if self.path != path: + if not os.path.exists(path) or \ + not os.path.samefile(self.path, path): shutil.copyfile(self.path, path) else: # need to serialize into Turtle From 4a5beb01635cb4c436830a274952b2df52d8536f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 16:31:44 +0300 Subject: [PATCH 4/8] Test that the SKOS graph dump is stored in loadvoc command --- tests/test_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index e21a7e104..56ab26f8f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -129,6 +129,8 @@ def test_loadvoc_tsv(testdatadir): assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0 assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists() assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists() + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0 def test_loadvoc_tsv_with_bom(testdatadir): @@ -148,6 +150,8 @@ def test_loadvoc_tsv_with_bom(testdatadir): assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0 assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists() assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists() + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0 def test_loadvoc_rdf(testdatadir): @@ -167,6 +171,8 @@ def test_loadvoc_rdf(testdatadir): assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0 assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists() assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists() + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0 def test_loadvoc_ttl(testdatadir): @@ -186,6 +192,8 @@ def test_loadvoc_ttl(testdatadir): assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0 assert testdatadir.join('vocabs/yso-fi/subjects.ttl').exists() assert testdatadir.join('vocabs/yso-fi/subjects.ttl').size() > 0 + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').exists() + assert testdatadir.join('vocabs/yso-fi/subjects.joblib.gz').size() > 0 def test_loadvoc_nonexistent_path(): From c6fb395b242fc742b8f92b2971353fd3aaf45564 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 16:46:29 +0300 Subject: [PATCH 5/8] Add tests for AnnifVocabulary.skos property method --- tests/test_vocab.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index c56de5d51..ea44a40f8 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -84,6 +84,25 @@ def test_update_subject_index_with_added_subjects(tmpdir): '42.42') +def test_skos(tmpdir): + vocab = load_dummy_vocab(tmpdir) + assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists() + assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() + assert isinstance(vocab.skos, annif.corpus.SubjectFileSKOS) + + +def test_skos_cache(tmpdir): + vocab = load_dummy_vocab(tmpdir) + assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists() + assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() + tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').remove() + assert not tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() + + assert isinstance(vocab.skos, annif.corpus.SubjectFileSKOS) + # cached dump file has been recreated in .skos property access + assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() + + def test_as_graph(tmpdir): vocab = load_dummy_vocab(tmpdir) graph = vocab.as_graph() From 2f5a1061d9a6e75aadb90f618be8d4eb1a5fd195 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 17:03:44 +0300 Subject: [PATCH 6/8] refactor AnnifVocabulary.skos property method --- annif/vocab.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/annif/vocab.py b/annif/vocab.py index 632e34e10..cce4f0ff1 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -65,13 +65,17 @@ def skos(self): logger.debug(f'loading graph dump from {dumppath}') self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath, self.language) - if self._skos_vocab is None: + return self._skos_vocab + + # graph dump file not found - parse ttl file instead path = os.path.join(self.datadir, 'subjects.ttl') if os.path.exists(path): logger.debug(f'loading graph from {path}') self._skos_vocab = annif.corpus.SubjectFileSKOS(path, self.language) + # store the dump file so we can use it next time self._skos_vocab.save_skos(path, self.language) + return self._skos_vocab else: raise NotInitializedException(f'graph file {path} not found') return self._skos_vocab From 7c11d1ea81b4b24e1dff90b605a9dbf8acd44cce Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 17:06:54 +0300 Subject: [PATCH 7/8] refactor AnnifVocabulary.skos more by inverting the if check --- annif/vocab.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/annif/vocab.py b/annif/vocab.py index cce4f0ff1..ba368fb0e 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -59,26 +59,28 @@ def subjects(self): @property def skos(self): """return the subject vocabulary from SKOS file""" - if self._skos_vocab is None: - dumppath = os.path.join(self.datadir, 'subjects.joblib.gz') - if os.path.exists(dumppath): - logger.debug(f'loading graph dump from {dumppath}') - self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath, - self.language) - return self._skos_vocab - - # graph dump file not found - parse ttl file instead - path = os.path.join(self.datadir, 'subjects.ttl') - if os.path.exists(path): - logger.debug(f'loading graph from {path}') - self._skos_vocab = annif.corpus.SubjectFileSKOS(path, - self.language) - # store the dump file so we can use it next time - self._skos_vocab.save_skos(path, self.language) - return self._skos_vocab - else: - raise NotInitializedException(f'graph file {path} not found') - return self._skos_vocab + if self._skos_vocab is not None: + return self._skos_vocab + + # attempt to load graph from dump file + dumppath = os.path.join(self.datadir, 'subjects.joblib.gz') + if os.path.exists(dumppath): + logger.debug(f'loading graph dump from {dumppath}') + self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath, + self.language) + return self._skos_vocab + + # graph dump file not found - parse ttl file instead + path = os.path.join(self.datadir, 'subjects.ttl') + if os.path.exists(path): + logger.debug(f'loading graph from {path}') + self._skos_vocab = annif.corpus.SubjectFileSKOS(path, + self.language) + # store the dump file so we can use it next time + self._skos_vocab.save_skos(path, self.language) + return self._skos_vocab + + raise NotInitializedException(f'graph file {path} not found') def load_vocabulary(self, subject_corpus, language): """load subjects from a subject corpus and save them into a From 5bd425343983284ebad22633fcc944d420094136 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 16 Aug 2021 17:20:57 +0300 Subject: [PATCH 8/8] Add test for case when SKOS vocabulary file not found --- tests/test_vocab.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index ea44a40f8..d098d9a0c 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -1,8 +1,10 @@ """Unit tests for vocabulary functionality in Annif""" +import pytest import os import annif.corpus import annif.vocab +from annif.exception import NotInitializedException import rdflib.namespace @@ -103,6 +105,17 @@ def test_skos_cache(tmpdir): assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() +def test_skos_not_found(tmpdir): + vocab = load_dummy_vocab(tmpdir) + assert tmpdir.join('vocabs/vocab-id/subjects.ttl').exists() + assert tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').exists() + tmpdir.join('vocabs/vocab-id/subjects.ttl').remove() + tmpdir.join('vocabs/vocab-id/subjects.joblib.gz').remove() + + with pytest.raises(NotInitializedException): + vocab.skos + + def test_as_graph(tmpdir): vocab = load_dummy_vocab(tmpdir) graph = vocab.as_graph()