Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize the optimize command #477

Merged
merged 1 commit into from
Mar 23, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import annif.project
import annif.registry
from annif.project import Access
from annif.suggestion import SuggestionFilter
from annif.suggestion import SuggestionFilter, ListSuggestionResult
from annif.exception import ConfigurationException, NotSupportedException

logger = annif.logger
Expand Down Expand Up @@ -85,9 +85,12 @@ def validate_backend_params(backend, beparam, project):
.format(backend, beparam, project.config['backend']))


BATCH_MAX_LIMIT = 15


def generate_filter_batches(subjects):
filter_batches = collections.OrderedDict()
for limit in range(1, 16):
for limit in range(1, BATCH_MAX_LIMIT + 1):
for threshold in [i * 0.05 for i in range(20)]:
hit_filter = SuggestionFilter(subjects, limit, threshold)
batch = annif.eval.EvaluationBatch(subjects)
Expand Down Expand Up @@ -394,7 +397,11 @@ def run_optimize(project_id, paths, docs_limit, backend_param):
ndocs = 0
docs = open_documents(paths, docs_limit)
for doc in docs.documents:
hits = project.suggest(doc.text, backend_params)
raw_hits = project.suggest(doc.text, backend_params)
hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
assert isinstance(hits, ListSuggestionResult), \
"Optimize should only be done with ListSuggestionResult " + \
"as it would be very slow with VectorSuggestionResult."
gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
for hit_filter, batch in filter_batches.values():
batch.evaluate(hit_filter(hits), gold_subjects)
Expand Down