From 5c1e9ca904d2e9892d1181b15e129b783d9b5a74 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 09:38:46 +0200 Subject: [PATCH 01/18] Added group membership to test thesaurus. --- tests/corpora/archaeology/yso-archaeology.rdf | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tests/corpora/archaeology/yso-archaeology.rdf b/tests/corpora/archaeology/yso-archaeology.rdf index cde905a91..66755cf4f 100644 --- a/tests/corpora/archaeology/yso-archaeology.rdf +++ b/tests/corpora/archaeology/yso-archaeology.rdf @@ -1119,4 +1119,140 @@ sigillvetenskap sigillografi + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 51 Archaeology + 51 Arkeologi + 51 Arkeologia + From 662f01fdef3c59c42a042ce4575ef2fb7de89885 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 09:41:05 +0200 Subject: [PATCH 02/18] Added stwfsapy backend. --- annif/backend/__init__.py | 6 ++ annif/backend/stwfsapy.py | 119 +++++++++++++++++++++++++++++++++ tests/test_backend_stwfsapy.py | 116 ++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 annif/backend/stwfsapy.py create mode 100644 tests/test_backend_stwfsapy.py diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 9ad0e0fe5..f001740e1 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -56,3 +56,9 @@ def get_backend(backend_id): register_backend(omikuji.OmikujiBackend) except ImportError: annif.logger.debug("Omikuji not available, not enabling omikuji backend") + +try: + from . 
import stwfsapy + register_backend(stwfsapy.StwfsapyBackend) +except ImportError: + annif.logger.debug("STWFSAPY not available, not enabling STWFSAPY backend") diff --git a/annif/backend/stwfsapy.py b/annif/backend/stwfsapy.py new file mode 100644 index 000000000..b04625794 --- /dev/null +++ b/annif/backend/stwfsapy.py @@ -0,0 +1,119 @@ +import os +from rdflib import Graph +from rdflib.util import guess_format +from stwfsapy.predictor import StwfsapyPredictor +from annif.exception import NotInitializedException, NotSupportedException +from annif.suggestion import ListSuggestionResult, SubjectSuggestion +from . import backend +from annif.util import boolean + + +_KEY_GRAPH_PATH = 'graph_path' +_KEY_CONCEPT_TYPE_URI = 'concept_type_uri' +_KEY_SUBTHESAURUS_TYPE_URI = 'sub_thesaurus_type_uri' +_KEY_THESAURUS_RELATION_TYPE_URI = 'thesaurus_relation_type_uri' +_KEY_THESAURUS_RELATION_IS_SPECIALISATION = ( + 'thesaurus_relation_is_specialisation') +_KEY_REMOVE_DEPRECATED = 'remove_deprecated' +_KEY_HANDLE_TITLE_CASE = 'handle_title_case' +_KEY_EXTRACT_UPPER_CASE_FROM_BRACES = 'extract_upper_case_from_braces' +_KEY_EXTRACT_ANY_CASE_FROM_BRACES = 'extract_any_case_from_braces' +_KEY_EXPAND_AMPERSAND_WITH_SPACES = 'expand_ampersand_with_spaces' +_KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION = ( + 'expand_abbreviation_with_punctuation') +_KEY_SIMPLE_ENGLISH_PLURAL_RULES = 'simple_english_plural_rules' + + +class StwfsapyBackend(backend.AnnifBackend): + + name = "stwfsapy" + needs_subject_index = False + + STWFSAPY_PARAMETERS = { + _KEY_GRAPH_PATH: str, + _KEY_CONCEPT_TYPE_URI: str, + _KEY_SUBTHESAURUS_TYPE_URI: str, + _KEY_THESAURUS_RELATION_TYPE_URI: str, + _KEY_THESAURUS_RELATION_IS_SPECIALISATION: boolean, + _KEY_REMOVE_DEPRECATED: boolean, + _KEY_HANDLE_TITLE_CASE: boolean, + _KEY_EXTRACT_UPPER_CASE_FROM_BRACES: boolean, + _KEY_EXTRACT_ANY_CASE_FROM_BRACES: boolean, + _KEY_EXPAND_AMPERSAND_WITH_SPACES: boolean, + _KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION: boolean, + 
_KEY_SIMPLE_ENGLISH_PLURAL_RULES: boolean, + } + + DEFAULT_PARAMETERS = { + _KEY_THESAURUS_RELATION_IS_SPECIALISATION: False, + _KEY_REMOVE_DEPRECATED: True, + _KEY_HANDLE_TITLE_CASE: True, + _KEY_EXTRACT_UPPER_CASE_FROM_BRACES: True, + _KEY_EXTRACT_ANY_CASE_FROM_BRACES: False, + _KEY_EXPAND_AMPERSAND_WITH_SPACES: True, + _KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION: True, + _KEY_SIMPLE_ENGLISH_PLURAL_RULES: False, + } + + MODEL_FILE = 'stwfsapy_predictor.zip' + + _model = None + + def initialize(self): + if self._model is None: + path = os.path.join(self.datadir, self.MODEL_FILE) + self.debug(f'Loading STWFSAPY model from {path}.') + if os.path.exists(path): + self._model = StwfsapyPredictor.load(path) + self.debug('Loaded model.') + else: + raise NotInitializedException( + f'Model not found at {path}', + backend_id=self.backend_id) + + def _train(self, corpus, params): + if corpus == 'cached': + raise NotSupportedException( + 'Training stwfsapy project from cached data not supported.') + if corpus.is_empty(): + raise NotSupportedException( + 'Cannot train stwfsapy project with no documents.') + self.debug("Transforming training data.") + X = [doc.text for doc in corpus.documents] + y = [doc.uris for doc in corpus.documents] + graph = Graph() + graph_path = params[_KEY_GRAPH_PATH] + graph.load(graph_path, format=guess_format(graph_path)) + new_params = { + key: self.STWFSAPY_PARAMETERS[key](val) + for key, val + in params.items() + if key in self.STWFSAPY_PARAMETERS + } + new_params.pop(_KEY_GRAPH_PATH) + p = StwfsapyPredictor( + graph=graph, + langs=frozenset([params['language']]), + **new_params) + p.fit(X, y) + self._model = p + p.store(os.path.join(self.datadir, self.MODEL_FILE)) + + def _suggest(self, text, params): + self.debug( + f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') + result = self._model.suggest_proba([text])[0] + suggestions = [] + for uri, score in result: + subject_id = self.project.subjects.by_uri(uri) + if subject_id: + label = self.project.subjects[subject_id][1] + else: + label = None + suggestion = SubjectSuggestion( + uri, + label, + None, + score) + suggestions.append(suggestion) + return ListSuggestionResult(suggestions) diff --git a/tests/test_backend_stwfsapy.py b/tests/test_backend_stwfsapy.py new file mode 100644 index 000000000..eed92e900 --- /dev/null +++ b/tests/test_backend_stwfsapy.py @@ -0,0 +1,116 @@ +import os +from annif.backend import get_backend +import annif.corpus +from annif.backend.stwfsapy import StwfsapyBackend +from annif.exception import NotSupportedException + +import pytest + +_rdf_file_path = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.rdf') + +_backend_conf = { + 'graph_path': _rdf_file_path, + 'language': 'fi', + 'concept_type_uri': 'http://www.w3.org/2004/02/skos/core#Concept', + 'sub_thesaurus_type_uri': + 'http://www.w3.org/2004/02/skos/core#Collection', + 'thesaurus_relation_type_uri': + 'http://www.w3.org/2004/02/skos/core#member', + 'thesaurus_relation_is_specialisation': True, +} + + +def test_stwfsapy_default_params(project): + stwfsapy_type = get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params={}, + project=project + ) + expected_default_params = { + 'thesaurus_relation_is_specialisation': False, + 'remove_deprecated': True, + 'handle_title_case': True, + 'extract_upper_case_from_braces': True, + 'extract_any_case_from_braces': False, + 'expand_ampersand_with_spaces': True, + 'expand_abbreviation_with_punctuation': True, + 'simple_english_plural_rules': False + } + actual_params = stwfsapy.params + assert expected_default_params == actual_params + + +def test_stwfsapy_train(document_corpus, project, datadir): + stwfsapy_type = 
get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params=_backend_conf, + project=project) + stwfsapy.train(document_corpus) + assert stwfsapy._model is not None + model_file = datadir.join(stwfsapy.MODEL_FILE) + assert model_file.exists() + assert model_file.size() > 0 + + +def test_empty_corpus(project): + corpus = annif.corpus.DocumentList([]) + stwfsapy_type = get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params=dict(), + project=project) + with pytest.raises(NotSupportedException): + stwfsapy.train(corpus) + + +def test_cached_corpus(project): + corpus = 'cached' + stwfsapy_type = get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params=dict(), + project=project) + with pytest.raises(NotSupportedException): + stwfsapy.train(corpus) + + +def test_stwfsapy_suggest_unknown(project): + stwfsapy_type = get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params=dict(), + project=project) + results = stwfsapy.suggest('1234') + assert len(results) == 0 + + +def test_stwfsapy_suggest(project, datadir): + stwfsapy_type = get_backend(StwfsapyBackend.name) + stwfsapy = stwfsapy_type( + backend_id=StwfsapyBackend.name, + config_params=dict(), + project=project) + # Just some randomly selected words, taken from YSO archaeology group. 
+ # And "random" words between them + results = stwfsapy.suggest("""random + muinais-DNA random random + labyrintit random random random + Eurooppalainen yleissopimus arkeologisen perinnön suojelusta random + Indus-kulttuuri random random random random + kiinteät muinaisjäännökset random random + makrofossiilit random + Mesa Verde random random random random + muinaismuistoalueet random random random + zikkuratit random random + termoluminesenssi random random random""") + assert len(results) == 10 + hits = results.as_list(project.subjects) + assert 'http://www.yso.fi/onto/yso/p14174' in [ + result.uri for result in hits] + assert 'labyrintit' in [result.label for result in hits] From 4a972137a137b21fe6eb4972d6ba89c4d57dbb29 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 10:46:11 +0200 Subject: [PATCH 03/18] Add STWFSAPY as dependency to setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a4090f711..c8d0e5e84 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ def read(fname): 'vw': ['vowpalwabbit==8.8.1'], 'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'], 'omikuji': ['omikuji==0.3.*'], + 'stwfsapy': ['stwfsapy==0.1.2'], 'dev': [ 'codecov', 'pytest-cov', From 2b0bb2c1b3c722e6bd13ee24d57db4d5d950bae2 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 15:49:28 +0200 Subject: [PATCH 04/18] Add as_graph method to vocabulary. 
--- annif/vocab.py | 10 ++++++++++ tests/test_vocab.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/annif/vocab.py b/annif/vocab.py index 83731efa0..2355d1f3b 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -1,6 +1,7 @@ """Vocabulary management functionality for Annif""" import os.path +import rdflib.graph import annif import annif.corpus import annif.util @@ -69,3 +70,12 @@ def load_vocabulary(self, subject_corpus, language): def as_skos(self): """return the vocabulary as a file object, in SKOS/Turtle syntax""" return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb') + + def as_graph(self): + """return the vocabulary as an rdflib graph""" + g = rdflib.graph.Graph() + g.load( + os.path.join(self.datadir, 'subjects.ttl'), + format='ttl' + ) + return g diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 53ac1dd47..247eb0826 100755 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -3,6 +3,7 @@ import os import annif.corpus import annif.vocab +import rdflib.term def load_dummy_vocab(tmpdir): @@ -81,3 +82,11 @@ def test_update_subject_index_with_added_subjects(tmpdir): assert vocab.subjects.by_uri('http://example.org/new-dummy') == 2 assert vocab.subjects[2] == ('http://example.org/new-dummy', 'new dummy', '42.42') + + +def test_as_graph(tmpdir): + vocab = load_dummy_vocab(tmpdir) + graph = vocab.as_graph() + tpls = list(graph[ + :rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#prefLabel'):]) + assert len(tpls) == 2 From 84539a8d9b41597d5d6a94fae83d896f06441620 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 15:50:18 +0200 Subject: [PATCH 05/18] Use internal graph in stwfsapy backend. 
--- annif/backend/stwfsapy.py | 11 ++--------- tests/test_backend_stwfsapy.py | 27 +++++++++++++++++++-------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/annif/backend/stwfsapy.py b/annif/backend/stwfsapy.py index b04625794..56f3ef47f 100644 --- a/annif/backend/stwfsapy.py +++ b/annif/backend/stwfsapy.py @@ -1,6 +1,4 @@ import os -from rdflib import Graph -from rdflib.util import guess_format from stwfsapy.predictor import StwfsapyPredictor from annif.exception import NotInitializedException, NotSupportedException from annif.suggestion import ListSuggestionResult, SubjectSuggestion @@ -8,7 +6,6 @@ from annif.util import boolean -_KEY_GRAPH_PATH = 'graph_path' _KEY_CONCEPT_TYPE_URI = 'concept_type_uri' _KEY_SUBTHESAURUS_TYPE_URI = 'sub_thesaurus_type_uri' _KEY_THESAURUS_RELATION_TYPE_URI = 'thesaurus_relation_type_uri' @@ -27,10 +24,9 @@ class StwfsapyBackend(backend.AnnifBackend): name = "stwfsapy" - needs_subject_index = False + needs_subject_index = True STWFSAPY_PARAMETERS = { - _KEY_GRAPH_PATH: str, _KEY_CONCEPT_TYPE_URI: str, _KEY_SUBTHESAURUS_TYPE_URI: str, _KEY_THESAURUS_RELATION_TYPE_URI: str, @@ -81,16 +77,13 @@ def _train(self, corpus, params): self.debug("Transforming training data.") X = [doc.text for doc in corpus.documents] y = [doc.uris for doc in corpus.documents] - graph = Graph() - graph_path = params[_KEY_GRAPH_PATH] - graph.load(graph_path, format=guess_format(graph_path)) + graph = self.project.vocab.as_graph() new_params = { key: self.STWFSAPY_PARAMETERS[key](val) for key, val in params.items() if key in self.STWFSAPY_PARAMETERS } - new_params.pop(_KEY_GRAPH_PATH) p = StwfsapyPredictor( graph=graph, langs=frozenset([params['language']]), diff --git a/tests/test_backend_stwfsapy.py b/tests/test_backend_stwfsapy.py index eed92e900..afb00ec67 100644 --- a/tests/test_backend_stwfsapy.py +++ b/tests/test_backend_stwfsapy.py @@ -1,19 +1,30 @@ import os from annif.backend import get_backend +from rdflib import Graph import 
annif.corpus from annif.backend.stwfsapy import StwfsapyBackend from annif.exception import NotSupportedException import pytest +from unittest.mock import Mock + + +@pytest.fixture +def graph_project(project): + _rdf_file_path = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.rdf') + g = Graph() + g.load(_rdf_file_path) + mock_vocab = Mock() + mock_vocab.as_graph.return_value = g + project.vocab = mock_vocab + return project -_rdf_file_path = os.path.join( - os.path.dirname(__file__), - 'corpora', - 'archaeology', - 'yso-archaeology.rdf') _backend_conf = { - 'graph_path': _rdf_file_path, 'language': 'fi', 'concept_type_uri': 'http://www.w3.org/2004/02/skos/core#Concept', 'sub_thesaurus_type_uri': @@ -45,12 +56,12 @@ def test_stwfsapy_default_params(project): assert expected_default_params == actual_params -def test_stwfsapy_train(document_corpus, project, datadir): +def test_stwfsapy_train(document_corpus, graph_project, datadir): stwfsapy_type = get_backend(StwfsapyBackend.name) stwfsapy = stwfsapy_type( backend_id=StwfsapyBackend.name, config_params=_backend_conf, - project=project) + project=graph_project) stwfsapy.train(document_corpus) assert stwfsapy._model is not None model_file = datadir.join(stwfsapy.MODEL_FILE) From 176888f61e4c6c464941931c083404223cf2c210 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 15:52:28 +0200 Subject: [PATCH 06/18] Perform only single iteration through document corpus when training stwfsapy backend. 
--- annif/backend/stwfsapy.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/annif/backend/stwfsapy.py b/annif/backend/stwfsapy.py index 56f3ef47f..0bd841f5e 100644 --- a/annif/backend/stwfsapy.py +++ b/annif/backend/stwfsapy.py @@ -75,8 +75,11 @@ def _train(self, corpus, params): raise NotSupportedException( 'Cannot train stwfsapy project with no documents.') self.debug("Transforming training data.") - X = [doc.text for doc in corpus.documents] - y = [doc.uris for doc in corpus.documents] + X = [] + y = [] + for doc in corpus.documents: + X.append(doc.text) + y.append(doc.uris) graph = self.project.vocab.as_graph() new_params = { key: self.STWFSAPY_PARAMETERS[key](val) From c498676460d9589a9ff14a19eb72ae9f072f170a Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 21 Aug 2020 16:39:40 +0200 Subject: [PATCH 07/18] Further specify test for as_graph method of vocab. --- tests/test_vocab.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 247eb0826..51a28e2b5 100755 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -3,7 +3,7 @@ import os import annif.corpus import annif.vocab -import rdflib.term +import rdflib.namespace def load_dummy_vocab(tmpdir): @@ -87,6 +87,21 @@ def test_update_subject_index_with_added_subjects(tmpdir): def test_as_graph(tmpdir): vocab = load_dummy_vocab(tmpdir) graph = vocab.as_graph() - tpls = list(graph[ - :rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#prefLabel'):]) - assert len(tpls) == 2 + labels = [ + (str(tpl[0]), str(tpl[1])) + for tpl + in graph[ + :rdflib.namespace.SKOS.prefLabel:] + ] + assert len(labels) == 2 + assert ('http://example.org/dummy', 'dummy') in labels + assert ('http://example.org/none', 'none') in labels + concepts = [ + str(tpl) + for tpl + in graph[ + :rdflib.namespace.RDF.type:rdflib.namespace.SKOS.Concept] + ] + assert len(concepts) == 2 + assert 'http://example.org/dummy' in 
concepts + assert 'http://example.org/none' in concepts From b93e221a79b7cedb128b3bfac473768f13fc55af Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 24 Aug 2020 15:19:03 +0200 Subject: [PATCH 08/18] Add default arguments for stwfsapy backend thesaurus type and thesaurus relation. --- annif/backend/stwfsapy.py | 2 ++ tests/test_backend_stwfsapy.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/annif/backend/stwfsapy.py b/annif/backend/stwfsapy.py index 0bd841f5e..da4eca994 100644 --- a/annif/backend/stwfsapy.py +++ b/annif/backend/stwfsapy.py @@ -41,6 +41,8 @@ class StwfsapyBackend(backend.AnnifBackend): } DEFAULT_PARAMETERS = { + _KEY_SUBTHESAURUS_TYPE_URI: '', + _KEY_THESAURUS_RELATION_TYPE_URI: '', _KEY_THESAURUS_RELATION_IS_SPECIALISATION: False, _KEY_REMOVE_DEPRECATED: True, _KEY_HANDLE_TITLE_CASE: True, diff --git a/tests/test_backend_stwfsapy.py b/tests/test_backend_stwfsapy.py index afb00ec67..22025f4d6 100644 --- a/tests/test_backend_stwfsapy.py +++ b/tests/test_backend_stwfsapy.py @@ -43,6 +43,8 @@ def test_stwfsapy_default_params(project): project=project ) expected_default_params = { + 'sub_thesaurus_type_uri': '', + 'thesaurus_relation_type_uri': '', 'thesaurus_relation_is_specialisation': False, 'remove_deprecated': True, 'handle_title_case': True, From b29977283d52fe6c5b721f736c2aa7a265aed27b Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 24 Aug 2020 15:38:37 +0200 Subject: [PATCH 09/18] Make stwfsapy a default backend. --- annif/backend/__init__.py | 8 ++------ setup.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index f001740e1..a14f21ab2 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -6,6 +6,7 @@ from . import tfidf from . import pav from . import maui +from . 
import stwfsapy import annif @@ -29,6 +30,7 @@ def get_backend(backend_id): register_backend(tfidf.TFIDFBackend) register_backend(pav.PAVBackend) register_backend(maui.MauiBackend) +register_backend(stwfsapy.StwfsapyBackend) # Optional backends try: @@ -56,9 +58,3 @@ def get_backend(backend_id): register_backend(omikuji.OmikujiBackend) except ImportError: annif.logger.debug("Omikuji not available, not enabling omikuji backend") - -try: - from . import stwfsapy - register_backend(stwfsapy.StwfsapyBackend) -except ImportError: - annif.logger.debug("STWFSAPY not available, not enabling STWFSAPY backend") diff --git a/setup.py b/setup.py index c8d0e5e84..f0c85c1a6 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ def read(fname): 'gunicorn', 'numpy==1.18.*', 'optuna==2.2.0' + 'stwfsapy==0.1.3', ], tests_require=['py', 'pytest', 'requests'], extras_require={ @@ -43,7 +44,6 @@ def read(fname): 'vw': ['vowpalwabbit==8.8.1'], 'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'], 'omikuji': ['omikuji==0.3.*'], - 'stwfsapy': ['stwfsapy==0.1.2'], 'dev': [ 'codecov', 'pytest-cov', From 82a6d1c4bd5eda153993f5bd14d1c10fe991ac30 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Thu, 7 Jan 2021 15:00:39 +0100 Subject: [PATCH 10/18] Renamed stwfsapy backend to stwfsa. --- annif/backend/__init__.py | 4 +- annif/backend/{stwfsapy.py => stwfsa.py} | 18 +++--- setup.py | 4 +- ...end_stwfsapy.py => test_backend_stwfsa.py} | 62 +++++++++---------- 4 files changed, 44 insertions(+), 44 deletions(-) rename annif/backend/{stwfsapy.py => stwfsa.py} (89%) rename tests/{test_backend_stwfsapy.py => test_backend_stwfsa.py} (68%) diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index a14f21ab2..10d81caaa 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -6,7 +6,7 @@ from . import tfidf from . import pav from . import maui -from . import stwfsapy +from . 
import stwfsa import annif @@ -30,7 +30,7 @@ def get_backend(backend_id): register_backend(tfidf.TFIDFBackend) register_backend(pav.PAVBackend) register_backend(maui.MauiBackend) -register_backend(stwfsapy.StwfsapyBackend) +register_backend(stwfsa.StwfsaBackend) # Optional backends try: diff --git a/annif/backend/stwfsapy.py b/annif/backend/stwfsa.py similarity index 89% rename from annif/backend/stwfsapy.py rename to annif/backend/stwfsa.py index da4eca994..d61e54854 100644 --- a/annif/backend/stwfsapy.py +++ b/annif/backend/stwfsa.py @@ -21,12 +21,12 @@ _KEY_SIMPLE_ENGLISH_PLURAL_RULES = 'simple_english_plural_rules' -class StwfsapyBackend(backend.AnnifBackend): +class StwfsaBackend(backend.AnnifBackend): - name = "stwfsapy" + name = "stwfsa" needs_subject_index = True - STWFSAPY_PARAMETERS = { + STWFSA_PARAMETERS = { _KEY_CONCEPT_TYPE_URI: str, _KEY_SUBTHESAURUS_TYPE_URI: str, _KEY_THESAURUS_RELATION_TYPE_URI: str, @@ -53,14 +53,14 @@ class StwfsapyBackend(backend.AnnifBackend): _KEY_SIMPLE_ENGLISH_PLURAL_RULES: False, } - MODEL_FILE = 'stwfsapy_predictor.zip' + MODEL_FILE = 'stwfsa_predictor.zip' _model = None def initialize(self): if self._model is None: path = os.path.join(self.datadir, self.MODEL_FILE) - self.debug(f'Loading STWFSAPY model from {path}.') + self.debug(f'Loading STWFSA model from {path}.') if os.path.exists(path): self._model = StwfsapyPredictor.load(path) self.debug('Loaded model.') @@ -72,10 +72,10 @@ def initialize(self): def _train(self, corpus, params): if corpus == 'cached': raise NotSupportedException( - 'Training stwfsapy project from cached data not supported.') + 'Training stwfsa project from cached data not supported.') if corpus.is_empty(): raise NotSupportedException( - 'Cannot train stwfsapy project with no documents.') + 'Cannot train stwfsa project with no documents.') self.debug("Transforming training data.") X = [] y = [] @@ -84,10 +84,10 @@ def _train(self, corpus, params): y.append(doc.uris) graph = 
self.project.vocab.as_graph() new_params = { - key: self.STWFSAPY_PARAMETERS[key](val) + key: self.STWFSA_PARAMETERS[key](val) for key, val in params.items() - if key in self.STWFSAPY_PARAMETERS + if key in self.STWFSA_PARAMETERS } p = StwfsapyPredictor( graph=graph, diff --git a/setup.py b/setup.py index f0c85c1a6..6f546dc41 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ def read(fname): 'rdflib', 'gunicorn', 'numpy==1.18.*', - 'optuna==2.2.0' - 'stwfsapy==0.1.3', + 'optuna==2.2.0', + 'stwfsapy==0.1.4' ], tests_require=['py', 'pytest', 'requests'], extras_require={ diff --git a/tests/test_backend_stwfsapy.py b/tests/test_backend_stwfsa.py similarity index 68% rename from tests/test_backend_stwfsapy.py rename to tests/test_backend_stwfsa.py index 22025f4d6..09652e223 100644 --- a/tests/test_backend_stwfsapy.py +++ b/tests/test_backend_stwfsa.py @@ -2,7 +2,7 @@ from annif.backend import get_backend from rdflib import Graph import annif.corpus -from annif.backend.stwfsapy import StwfsapyBackend +from annif.backend.stwfsa import StwfsaBackend from annif.exception import NotSupportedException import pytest @@ -35,10 +35,10 @@ def graph_project(project): } -def test_stwfsapy_default_params(project): - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, +def test_stwfsa_default_params(project): + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params={}, project=project ) @@ -54,64 +54,64 @@ def test_stwfsapy_default_params(project): 'expand_abbreviation_with_punctuation': True, 'simple_english_plural_rules': False } - actual_params = stwfsapy.params + actual_params = stwfsa.params assert expected_default_params == actual_params -def test_stwfsapy_train(document_corpus, graph_project, datadir): - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, +def 
test_stwfsa_train(document_corpus, graph_project, datadir): + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params=_backend_conf, project=graph_project) - stwfsapy.train(document_corpus) - assert stwfsapy._model is not None - model_file = datadir.join(stwfsapy.MODEL_FILE) + stwfsa.train(document_corpus) + assert stwfsa._model is not None + model_file = datadir.join(stwfsa.MODEL_FILE) assert model_file.exists() assert model_file.size() > 0 def test_empty_corpus(project): corpus = annif.corpus.DocumentList([]) - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params=dict(), project=project) with pytest.raises(NotSupportedException): - stwfsapy.train(corpus) + stwfsa.train(corpus) def test_cached_corpus(project): corpus = 'cached' - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params=dict(), project=project) with pytest.raises(NotSupportedException): - stwfsapy.train(corpus) + stwfsa.train(corpus) -def test_stwfsapy_suggest_unknown(project): - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, +def test_stwfsa_suggest_unknown(project): + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params=dict(), project=project) - results = stwfsapy.suggest('1234') + results = stwfsa.suggest('1234') assert len(results) == 0 -def test_stwfsapy_suggest(project, datadir): - stwfsapy_type = get_backend(StwfsapyBackend.name) - stwfsapy = stwfsapy_type( - backend_id=StwfsapyBackend.name, +def test_stwfsa_suggest(project, datadir): + stwfsa_type = 
get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id=StwfsaBackend.name, config_params=dict(), project=project) # Just some randomly selected words, taken from YSO archaeology group. # And "random" words between them - results = stwfsapy.suggest("""random + results = stwfsa.suggest("""random muinais-DNA random random labyrintit random random random Eurooppalainen yleissopimus arkeologisen perinnön suojelusta random From 783cea8a0937cb04c4eacbf71af8033d4d9f653a Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Fri, 8 Jan 2021 11:56:30 +0100 Subject: [PATCH 11/18] Add `input_limit` parameter to stwfsa backend. --- annif/backend/stwfsa.py | 4 ++++ setup.py | 2 +- tests/test_backend_stwfsa.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index d61e54854..3def4ec55 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -19,6 +19,7 @@ _KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION = ( 'expand_abbreviation_with_punctuation') _KEY_SIMPLE_ENGLISH_PLURAL_RULES = 'simple_english_plural_rules' +_KEY_INPUT_LIMIT = 'input_limit' class StwfsaBackend(backend.AnnifBackend): @@ -38,6 +39,7 @@ class StwfsaBackend(backend.AnnifBackend): _KEY_EXPAND_AMPERSAND_WITH_SPACES: boolean, _KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION: boolean, _KEY_SIMPLE_ENGLISH_PLURAL_RULES: boolean, + _KEY_INPUT_LIMIT: int, } DEFAULT_PARAMETERS = { @@ -51,6 +53,7 @@ class StwfsaBackend(backend.AnnifBackend): _KEY_EXPAND_AMPERSAND_WITH_SPACES: True, _KEY_EXPAND_ABBREVIATION_WITH_PUNCTUATION: True, _KEY_SIMPLE_ENGLISH_PLURAL_RULES: False, + _KEY_INPUT_LIMIT: 0, } MODEL_FILE = 'stwfsa_predictor.zip' @@ -89,6 +92,7 @@ def _train(self, corpus, params): in params.items() if key in self.STWFSA_PARAMETERS } + new_params.pop(_KEY_INPUT_LIMIT) p = StwfsapyPredictor( graph=graph, langs=frozenset([params['language']]), diff --git a/setup.py b/setup.py index 6f546dc41..18f0eafff 100644 --- a/setup.py +++ b/setup.py @@ 
-35,7 +35,7 @@ def read(fname): 'gunicorn', 'numpy==1.18.*', 'optuna==2.2.0', - 'stwfsapy==0.1.4' + 'stwfsapy==0.1.5' ], tests_require=['py', 'pytest', 'requests'], extras_require={ diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index 09652e223..fb523a116 100644 --- a/tests/test_backend_stwfsa.py +++ b/tests/test_backend_stwfsa.py @@ -52,7 +52,8 @@ def test_stwfsa_default_params(project): 'extract_any_case_from_braces': False, 'expand_ampersand_with_spaces': True, 'expand_abbreviation_with_punctuation': True, - 'simple_english_plural_rules': False + 'simple_english_plural_rules': False, + 'input_limit': 0, } actual_params = stwfsa.params assert expected_default_params == actual_params From f1b24ed7847f0449e199b1b2a9f172a4d4a1f8b7 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 11:41:06 +0100 Subject: [PATCH 12/18] Better defaults for concepts, thesauri and their relation in the STWFSA backend. --- annif/backend/stwfsa.py | 7 ++++--- tests/test_backend_stwfsa.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index 3def4ec55..2f4596741 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -43,9 +43,10 @@ class StwfsaBackend(backend.AnnifBackend): } DEFAULT_PARAMETERS = { - _KEY_SUBTHESAURUS_TYPE_URI: '', - _KEY_THESAURUS_RELATION_TYPE_URI: '', - _KEY_THESAURUS_RELATION_IS_SPECIALISATION: False, + _KEY_CONCEPT_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#Concept', + _KEY_SUBTHESAURUS_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#Collection', + _KEY_THESAURUS_RELATION_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#member', + _KEY_THESAURUS_RELATION_IS_SPECIALISATION: True, _KEY_REMOVE_DEPRECATED: True, _KEY_HANDLE_TITLE_CASE: True, _KEY_EXTRACT_UPPER_CASE_FROM_BRACES: True, diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index fb523a116..eab474e1d 100644 --- a/tests/test_backend_stwfsa.py +++ 
b/tests/test_backend_stwfsa.py @@ -43,9 +43,10 @@ def test_stwfsa_default_params(project): project=project ) expected_default_params = { - 'sub_thesaurus_type_uri': '', - 'thesaurus_relation_type_uri': '', - 'thesaurus_relation_is_specialisation': False, + 'concept_type_uri': 'http://www.w3.org/2004/02/skos/core#Concept', + 'sub_thesaurus_type_uri': 'http://www.w3.org/2004/02/skos/core#Collection', + 'thesaurus_relation_type_uri': 'http://www.w3.org/2004/02/skos/core#member', + 'thesaurus_relation_is_specialisation': True, 'remove_deprecated': True, 'handle_title_case': True, 'extract_upper_case_from_braces': True, From 85ed0207b8d85890eb078742696d54fe5f85863b Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 11:49:33 +0100 Subject: [PATCH 13/18] Remove group info from archaeology test vocabulary. --- tests/corpora/archaeology/yso-archaeology.rdf | 136 ------------------ 1 file changed, 136 deletions(-) diff --git a/tests/corpora/archaeology/yso-archaeology.rdf b/tests/corpora/archaeology/yso-archaeology.rdf index 66755cf4f..cde905a91 100644 --- a/tests/corpora/archaeology/yso-archaeology.rdf +++ b/tests/corpora/archaeology/yso-archaeology.rdf @@ -1119,140 +1119,4 @@ sigillvetenskap sigillografi - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 51 Archaeology - 51 Arkeologi - 51 Arkeologia - From 2f936530c33310c1cfefafb9cb7199c5158dc3d9 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 11:56:41 +0100 Subject: [PATCH 14/18] Use atomic_save in STWFSA backend. 
--- annif/backend/stwfsa.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index 2f4596741..d1977b8d7 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -3,7 +3,7 @@ from annif.exception import NotInitializedException, NotSupportedException from annif.suggestion import ListSuggestionResult, SubjectSuggestion from . import backend -from annif.util import boolean +from annif.util import atomic_save, boolean _KEY_CONCEPT_TYPE_URI = 'concept_type_uri' @@ -44,8 +44,10 @@ class StwfsaBackend(backend.AnnifBackend): DEFAULT_PARAMETERS = { _KEY_CONCEPT_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#Concept', - _KEY_SUBTHESAURUS_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#Collection', - _KEY_THESAURUS_RELATION_TYPE_URI: 'http://www.w3.org/2004/02/skos/core#member', + _KEY_SUBTHESAURUS_TYPE_URI: + 'http://www.w3.org/2004/02/skos/core#Collection', + _KEY_THESAURUS_RELATION_TYPE_URI: + 'http://www.w3.org/2004/02/skos/core#member', _KEY_THESAURUS_RELATION_IS_SPECIALISATION: True, _KEY_REMOVE_DEPRECATED: True, _KEY_HANDLE_TITLE_CASE: True, @@ -100,7 +102,11 @@ def _train(self, corpus, params): **new_params) p.fit(X, y) self._model = p - p.store(os.path.join(self.datadir, self.MODEL_FILE)) + atomic_save( + p, + self.datadir, + self.MODEL_FILE, + lambda model, store_path: model.store(store_path)) def _suggest(self, text, params): self.debug( From 920305c63d2fb1889fd22c1aa4cfcaef580805f1 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 11:58:30 +0100 Subject: [PATCH 15/18] Add version constraint to rdflib. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18f0eafff..2e7cfadc6 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def read(fname): 'gensim==3.8.*', 'scikit-learn==0.23.2', 'scipy==1.5.3', - 'rdflib', + 'rdflib>=4.2,<6.0', 'gunicorn', 'numpy==1.18.*', 'optuna==2.2.0', From 3c400a80decf5c5730ea38f92971c636eadde020 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 12:17:42 +0100 Subject: [PATCH 16/18] Add test for uninitialized STWFSA backend. --- tests/test_backend_stwfsa.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index eab474e1d..34f397b70 100644 --- a/tests/test_backend_stwfsa.py +++ b/tests/test_backend_stwfsa.py @@ -3,7 +3,7 @@ from rdflib import Graph import annif.corpus from annif.backend.stwfsa import StwfsaBackend -from annif.exception import NotSupportedException +from annif.exception import NotInitializedException, NotSupportedException import pytest from unittest.mock import Mock @@ -44,8 +44,10 @@ def test_stwfsa_default_params(project): ) expected_default_params = { 'concept_type_uri': 'http://www.w3.org/2004/02/skos/core#Concept', - 'sub_thesaurus_type_uri': 'http://www.w3.org/2004/02/skos/core#Collection', - 'thesaurus_relation_type_uri': 'http://www.w3.org/2004/02/skos/core#member', + 'sub_thesaurus_type_uri': + 'http://www.w3.org/2004/02/skos/core#Collection', + 'thesaurus_relation_type_uri': + 'http://www.w3.org/2004/02/skos/core#member', 'thesaurus_relation_is_specialisation': True, 'remove_deprecated': True, 'handle_title_case': True, @@ -60,6 +62,17 @@ def test_stwfsa_default_params(project): assert expected_default_params == actual_params +def test_stwfsa_not_initialized(project): + stwfsa_type = get_backend(StwfsaBackend.name) + stwfsa = stwfsa_type( + backend_id='stwfsa', + config_params={}, + project=project) + with 
pytest.raises(NotInitializedException): + stwfsa.suggest("example text") + + + def test_stwfsa_train(document_corpus, graph_project, datadir): stwfsa_type = get_backend(StwfsaBackend.name) stwfsa = stwfsa_type( @@ -105,6 +118,7 @@ def test_stwfsa_suggest_unknown(project): assert len(results) == 0 + def test_stwfsa_suggest(project, datadir): stwfsa_type = get_backend(StwfsaBackend.name) stwfsa = stwfsa_type( From 0e1e881a8c51e0246cc005d7cfcee342ead5b50e Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 13:24:50 +0100 Subject: [PATCH 17/18] Remove blank lines in STWFSA test file. --- tests/test_backend_stwfsa.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index 34f397b70..352092f11 100644 --- a/tests/test_backend_stwfsa.py +++ b/tests/test_backend_stwfsa.py @@ -72,7 +72,6 @@ def test_stwfsa_not_initialized(project): stwfsa.suggest("example text") - def test_stwfsa_train(document_corpus, graph_project, datadir): stwfsa_type = get_backend(StwfsaBackend.name) stwfsa = stwfsa_type( @@ -118,7 +117,6 @@ def test_stwfsa_suggest_unknown(project): assert len(results) == 0 - def test_stwfsa_suggest(project, datadir): stwfsa_type = get_backend(StwfsaBackend.name) stwfsa = stwfsa_type( From fc18969fa466ec8b38c9889443998c69ae3b0603 Mon Sep 17 00:00:00 2001 From: Moritz Fuerneisen Date: Mon, 25 Jan 2021 17:49:37 +0100 Subject: [PATCH 18/18] Try to reduce cyclomatic complexity in stwfsa backend. 
--- annif/backend/stwfsa.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index d1977b8d7..65d57ab08 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -75,7 +75,7 @@ def initialize(self): f'Model not found at {path}', backend_id=self.backend_id) - def _train(self, corpus, params): + def _load_data(self, corpus): if corpus == 'cached': raise NotSupportedException( 'Training stwfsa project from cached data not supported.') @@ -88,7 +88,10 @@ def _train(self, corpus, params): for doc in corpus.documents: X.append(doc.text) y.append(doc.uris) - graph = self.project.vocab.as_graph() + return X, y + + def _train(self, corpus, params): + X, y = self._load_data(corpus) new_params = { key: self.STWFSA_PARAMETERS[key](val) for key, val @@ -97,7 +100,7 @@ def _train(self, corpus, params): } new_params.pop(_KEY_INPUT_LIMIT) p = StwfsapyPredictor( - graph=graph, + graph=self.project.vocab.as_graph(), langs=frozenset([params['language']]), **new_params) p.fit(X, y)