Skip to content

Commit

Permalink
Merge branch 'develop' into dependabot/pip/backend/scikit-learn-1.5.0
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen authored Jul 11, 2024
2 parents 311f09b + d2cf237 commit 85dda40
Show file tree
Hide file tree
Showing 124 changed files with 2,189 additions and 404 deletions.
20 changes: 20 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,26 @@
"/*": "*",
"/./~/*": "${webRoot}/node_modules/*"
}
},
{
"name": "celery",
"type": "debugpy",
"request": "launch",
"cwd": "${workspaceFolder}/backend",
"env": {
"PYTHONPATH": "${workspaceFolder}/backend"
},
"module": "celery",
"console": "integratedTerminal",
"args": [
"-A",
"ianalyzer.celery",
"worker",
"--pool=solo",
"--concurrency=1",
"--events",
"--loglevel=info"
]
}
],
"inputs": [
Expand Down
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.7.0
date-released: '2024-06-5'
version: 5.9.0
date-released: '2024-07-05'
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM docker.io/library/python:3.9
# Setting this means stdout and stderr streams are sent to terminal in real time
ENV PYTHONUNBUFFERED 1
# Get required libraries for xmlsec
RUN apt-get -y update
RUN apt-get -y update && apt-get -y upgrade
RUN apt-get install -y pkg-config libxml2-dev libxmlsec1-dev libxmlsec1-openssl default-libmysqlclient-dev

RUN pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion backend/addcorpus/json_corpora/export_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def export_json_corpus(corpus: Corpus) -> Dict:
config = corpus.configuration
data = {'name': corpus.name, 'id': corpus.pk }
data = {'name': corpus.name}
data['meta'] = export_corpus_meta(config)
data['source_data'] = export_corpus_source_data(config)
options = export_corpus_options(config)
Expand Down
5 changes: 2 additions & 3 deletions backend/addcorpus/json_corpora/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
from addcorpus.models import Corpus, Field
from addcorpus.json_corpora.import_json import _parse_field

def test_corpus_export(json_mock_corpus: Corpus, json_corpus_data):
def test_corpus_export(json_mock_corpus: Corpus, json_corpus_definition):
result = export_json_corpus(json_mock_corpus)
result.pop('id')
assert result == json_corpus_data
assert result == json_corpus_definition

def test_field_export(any_field_json):
imported = _parse_field(any_field_json)
Expand Down
39 changes: 26 additions & 13 deletions backend/addcorpus/json_corpora/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
from addcorpus.serializers import CorpusJSONDefinitionSerializer
from addcorpus.models import Corpus, CorpusConfiguration

def test_json_corpus_import(db, json_corpus_data):
Corpus.objects.all().delete()
def test_json_corpus_import(db, json_mock_corpus, json_corpus_definition):
json_mock_corpus.delete()

serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data = {
'definition': json_corpus_definition,
'active': True,
}

serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
corpus = serializer.create(serializer.validated_data)

Expand Down Expand Up @@ -35,30 +40,38 @@ def test_json_corpus_import(db, json_corpus_data):
assert line_field.display_type == 'text_content'


def test_serializer_representation(db, json_corpus_data):
Corpus.objects.all().delete()
def test_serializer_representation(db, json_mock_corpus, json_corpus_definition):
json_mock_corpus.delete()

data = {
'definition': json_corpus_definition,
'active': True,
}

serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
corpus = serializer.create(serializer.validated_data)

serialized = serializer.to_representation(corpus)
serialized.pop('id')
assert json_corpus_data == serialized
assert json_corpus_definition == serialized['definition']

def test_serializer_update(db, json_corpus_data, json_mock_corpus: Corpus):
def test_serializer_update(db, json_corpus_definition, json_mock_corpus: Corpus):
# edit description
json_corpus_data['meta']['description'] = 'A different description'
serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data = {
'definition': json_corpus_definition,
'active': True,
}
data['definition']['meta']['description'] = 'A different description'
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
serializer.update(json_mock_corpus, serializer.validated_data)
corpus_config = CorpusConfiguration.objects.get(corpus=json_mock_corpus)
assert corpus_config.description == 'A different description'

# remove a field
assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 2
json_corpus_data['fields'] = json_corpus_data['fields'][:-1]
serializer = CorpusJSONDefinitionSerializer(data=json_corpus_data)
data['definition']['fields'] = data['definition']['fields'][:-1]
serializer = CorpusJSONDefinitionSerializer(data=data)
assert serializer.is_valid()
serializer.update(json_mock_corpus, serializer.validated_data)
assert Field.objects.filter(corpus_configuration__corpus=json_mock_corpus).count() == 1
Expand Down
8 changes: 4 additions & 4 deletions backend/addcorpus/json_corpora/tests/test_validate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from addcorpus.json_corpora.validate import validate


def test_validate(json_corpus_data):
validate(json_corpus_data)
def test_validate(json_corpus_definition):
validate(json_corpus_definition)


def test_validate_subschema(json_corpus_data):
source_data = json_corpus_data['source_data']
def test_validate_subschema(json_corpus_definition):
source_data = json_corpus_definition['source_data']
validate(source_data, 'properties', 'source_data')
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.11 on 2024-07-05 16:30

import addcorpus.validation.creation
from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter two fields of the ``addcorpus`` app.

    Redefines the ``type`` field of ``corpusdocumentationpage`` (its choices,
    default, help text and maximum length) and the ``name`` field of ``field``
    (its help text, maximum length and validators).
    """

    dependencies = [
        ('addcorpus', '0022_add_url_display_type'),
    ]

    operations = [
        migrations.AlterField(
            model_name='corpusdocumentationpage',
            name='type',
            field=models.CharField(
                choices=[
                    ('general', 'General information'),
                    ('citation', 'Citation'),
                    ('license', 'License'),
                    ('terms_of_service', 'Terms of service'),
                    ('wordmodels', 'Word models'),
                ],
                default='general',
                help_text='the type of documentation',
                max_length=16,
            ),
        ),
        migrations.AlterField(
            model_name='field',
            name='name',
            field=models.SlugField(
                help_text='internal name for the field',
                max_length=126,
                validators=[
                    addcorpus.validation.creation.validate_name_is_not_a_route_parameter,
                    addcorpus.validation.creation.validate_name_has_no_ner_suffix,
                ],
            ),
        ),
    ]
29 changes: 25 additions & 4 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
validate_mimetype,
validate_name_is_not_a_route_parameter, validate_search_filter,
validate_search_filter_with_mapping,
validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
validate_search_filter, validate_search_filter_with_mapping,
validate_searchable_field_has_full_text_search,
validate_sort_configuration, validate_visualizations_with_mapping,
validate_source_data_directory,
Expand All @@ -21,6 +21,8 @@
from django.db import models
from django.db.models.constraints import UniqueConstraint

from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
MAX_LENGTH_TITLE = 256
Expand Down Expand Up @@ -260,6 +262,20 @@ def clean(self):
e
])

@property
def has_named_entities(self):
    """Report whether the corpus index has any named-entity fields.

    Requests the mapping of ``self.es_index`` from Elasticsearch and checks
    whether any field name under ``mappings.properties`` ends with ``:ner``.

    Returns:
        bool: ``True`` if at least one such field name is found; ``False``
        if none is found, or if the mapping cannot be fetched or read.
    """
    client = elasticsearch(self.es_index)
    try:
        mapping = client.indices.get_mapping(index=self.es_index)
        properties = mapping[self.es_index].get(
            'mappings', {}).get('properties', {})
        return any(name.endswith(':ner') for name in properties)
    except Exception:
        # Best effort: a missing index, an unreachable cluster or an
        # unexpected response shape all count as "no named entities".
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return False


FIELD_DISPLAY_TYPES = [
('text_content', 'text content'),
Expand Down Expand Up @@ -293,7 +309,8 @@ def clean(self):
class Field(models.Model):
name = models.SlugField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter],
validators=[validate_name_is_not_a_route_parameter,
validate_name_has_no_ner_suffix],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand Down Expand Up @@ -431,11 +448,12 @@ def clean(self):
e
])


class CorpusDocumentationPage(models.Model):
class PageType(models.TextChoices):
GENERAL = ('general', 'General information')
CITATION = ('citation', 'Citation')
LICENSE = ('license', 'Licence')
LICENSE = ('license', 'License')
TERMS_OF_SERVICE = ('terms_of_service', 'Terms of service')
WORDMODELS = ('wordmodels', 'Word models')

Expand All @@ -455,6 +473,9 @@ class PageType(models.TextChoices):
help_text='markdown contents of the documentation'
)

def __str__(self):
    """Return the name of the page's corpus and the page type, joined by ' - '."""
    corpus_name = self.corpus_configuration.corpus.name
    return f'{corpus_name} - {self.type}'

class Meta:
constraints = [
UniqueConstraint(
Expand Down
42 changes: 31 additions & 11 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class CorpusConfigurationSerializer(serializers.ModelSerializer):
languages = serializers.ListField(child=LanguageField())
category = PrettyChoiceField(choices=CATEGORIES)
default_sort = NonEmptyJSONField()
has_named_entities = serializers.ReadOnlyField()

class Meta:
model = CorpusConfiguration
Expand All @@ -89,6 +90,7 @@ class Meta:
'default_sort',
'language_field',
'fields',
'has_named_entities',
]


Expand Down Expand Up @@ -128,33 +130,47 @@ class Meta:
fields = ['corpus_configuration', 'type', 'content']


class CorpusJSONDefinitionSerializer(serializers.ModelSerializer):
class Meta:
model = Corpus
fields = '__all__'
class JSONDefinitionField(serializers.Field):
def get_attribute(self, instance: Corpus):
return instance

def to_representation(self, instance) -> Dict:
return export_json_corpus(instance)
def to_representation(self, value: Corpus) -> Dict:
return export_json_corpus(value)

def to_internal_value(self, data) -> Dict:
def to_internal_value(self, data: Dict) -> Dict:
return import_json_corpus(data)


class CorpusJSONDefinitionSerializer(serializers.ModelSerializer):
definition = JSONDefinitionField()

class Meta:
model = Corpus
fields = ['id', 'active', 'definition']
read_only_fields = ['id']

def create(self, validated_data: Dict):
configuration_data = validated_data.pop('configuration')
definition_data = validated_data.get('definition')
configuration_data = definition_data.pop('configuration')
fields_data = configuration_data.pop('fields')

corpus = Corpus.objects.create(**validated_data)
corpus = Corpus.objects.create(**definition_data)
configuration = CorpusConfiguration.objects.create(corpus=corpus, **configuration_data)
for field_data in fields_data:
Field.objects.create(corpus_configuration=configuration, **field_data)

if validated_data.get('active') == True:
corpus.active = True
corpus.save()

return corpus

def update(self, instance: Corpus, validated_data: Dict):
configuration_data = validated_data.pop('configuration')
definition_data = validated_data.get('definition')
configuration_data = definition_data.pop('configuration')
fields_data = configuration_data.pop('fields')

corpus = Corpus(pk=instance.pk, **validated_data)
corpus = Corpus(pk=instance.pk, **definition_data)
corpus.save()

configuration, _ = CorpusConfiguration.objects.get_or_create(corpus=corpus)
Expand All @@ -172,4 +188,8 @@ def update(self, instance: Corpus, validated_data: Dict):

configuration.fields.exclude(name__in=(f['name'] for f in fields_data)).delete()

if validated_data.get('active') == True:
corpus.active = True
corpus.save()

return corpus
12 changes: 9 additions & 3 deletions backend/addcorpus/tests/test_corpus_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from addcorpus.python_corpora.save_corpus import load_and_save_all_corpora

def test_no_corpora(db, settings, admin_client):
Corpus.objects.all().delete()
settings.CORPORA = {}
load_and_save_all_corpora()

Expand All @@ -18,9 +19,14 @@ def test_no_corpora(db, settings, admin_client):
def test_corpus_documentation_view(admin_client, basic_mock_corpus, settings):
response = admin_client.get(f'/api/corpus/documentation/{basic_mock_corpus}/')
assert response.status_code == 200
pages = response.data

# check that the pages are sorted in canonical order
page_types = [page['type'] for page in pages]
assert page_types == ['General information', 'Citation', 'License']

# should contain citation guidelines
citation_page = next(page for page in response.data if page['type'] == 'Citation')
citation_page = next(page for page in pages if page['type'] == 'Citation')

# check that the page template is rendered with context
content = citation_page['content']
Expand Down Expand Up @@ -84,7 +90,7 @@ def test_corpus_not_publication_ready(admin_client, basic_mock_corpus):
response = admin_client.get('/api/corpus/')
corpus = not any(c['name'] == basic_mock_corpus for c in response.data)

def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mock_corpus: Corpus):
def test_corpus_edit_views(admin_client: Client, json_corpus_definition: Dict, json_mock_corpus: Corpus):
json_mock_corpus.delete()

response = admin_client.get('/api/corpus/definitions/')
Expand All @@ -93,7 +99,7 @@ def test_corpus_edit_views(admin_client: Client, json_corpus_data: Dict, json_mo

response = admin_client.post(
'/api/corpus/definitions/',
json_corpus_data,
{'definition': json_corpus_definition, 'active': True},
content_type='application/json',
)
assert status.is_success(response.status_code)
Expand Down
7 changes: 7 additions & 0 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ def validate_name_is_not_a_route_parameter(value):
f'{value} cannot be used as a field name, because it is also a route parameter'
)


def validate_name_has_no_ner_suffix(value):
    """Reject field names that end with the ``:ner`` suffix.

    Args:
        value: the proposed field name.

    Raises:
        ValidationError: if ``value`` ends with ``:ner``.
    """
    if not value.endswith(':ner'):
        return
    message = f'{value} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields'
    raise ValidationError(message)

def mapping_can_be_searched(es_mapping):
'''
Verify if a mapping is appropriate for searching
Expand Down
Loading

0 comments on commit 85dda40

Please sign in to comment.