diff --git a/doc/release-notes/11095-fix-extcvoc-indexing.md b/doc/release-notes/11095-fix-extcvoc-indexing.md new file mode 100644 index 00000000000..f4931d81263 --- /dev/null +++ b/doc/release-notes/11095-fix-extcvoc-indexing.md @@ -0,0 +1,7 @@ +Some External Controlled Vocabulary scripts/configurations, when used on a metadata field that is single-valued could result +in indexing failure for the dataset (e.g. when the script tried to index both the identifier and name of the identified entity for indexing). +Dataverse has been updated to correctly indicate the need for a multi-valued Solr field in these cases in the call to /api/admin/index/solr/schema. +Configuring the Solr schema and the update-fields.sh script as usually recommended when using custom metadata blocks will resolve the issue. + +The overall release notes should include a Solr update (which hopefully is required by an update to 9.7.0 anyway) and our standard instructions +should change to recommending use of the update-fields.sh script when using custom metadatablocks *and/or external vocabulary scripts*. diff --git a/doc/sphinx-guides/source/admin/metadatacustomization.rst b/doc/sphinx-guides/source/admin/metadatacustomization.rst index e5326efebef..2a104354af9 100644 --- a/doc/sphinx-guides/source/admin/metadatacustomization.rst +++ b/doc/sphinx-guides/source/admin/metadatacustomization.rst @@ -559,8 +559,7 @@ Using External Vocabulary Services The Dataverse software has a mechanism to associate specific fields defined in metadata blocks with a vocabulary(ies) managed by external services. The mechanism relies on trusted third-party Javascripts. The mapping from field type to external vocabulary(ies) is managed via the :ref:`:CVocConf <:CVocConf>` setting. -*This functionality is considered 'experimental'. It may require significant effort to configure and is likely to evolve in subsequent Dataverse software releases.* - +*This functionality may require significant effort to configure and is likely to evolve in subsequent Dataverse software releases.* The effect of configuring this mechanism is similar to that of defining a field in a metadata block with 'allowControlledVocabulary=true': @@ -585,6 +584,9 @@ Configuration involves specifying which fields are to be mapped, to which Solr f These are all defined in the :ref:`:CVocConf <:CVocConf>` setting as a JSON array. Details about the required elements as well as example JSON arrays are available at https://github.com/gdcc/dataverse-external-vocab-support, along with an example metadata block that can be used for testing. The scripts required can be hosted locally or retrieved dynamically from https://gdcc.github.io/ (similar to how dataverse-previewers work). +Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), +updating the Solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. + Please note that in addition to the :ref:`:CVocConf` described above, an alternative is the :ref:`:ControlledVocabularyCustomJavaScript` setting. Protecting MetadataBlocks diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 30a36da9499..a653a100c89 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -4653,6 +4653,9 @@ The commands below should give you an idea of how to load the configuration, but ``curl -X PUT --upload-file cvoc-conf.json http://localhost:8080/api/admin/settings/:CVocConf`` +Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), +updating the Solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. + .. _:ControlledVocabularyCustomJavaScript: :ControlledVocabularyCustomJavaScript diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index c30a77acb58..bc9a8ae692b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -44,6 +44,7 @@ import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; @@ -451,11 +452,11 @@ public Response clearOrphans(@QueryParam("sync") String sync) { public String getSolrSchema() { StringBuilder sb = new StringBuilder(); - - for (DatasetFieldType datasetField : datasetFieldService.findAllOrderedByName()) { + Map cvocTermUriMap = datasetFieldSvc.getCVocConf(true); + for (DatasetFieldType datasetFieldType : datasetFieldService.findAllOrderedByName()) { //ToDo - getSolrField() creates/returns a new object - just get it once and re-use - String nameSearchable = datasetField.getSolrField().getNameSearchable(); - SolrField.SolrType solrType = datasetField.getSolrField().getSolrType(); + String nameSearchable = datasetFieldType.getSolrField().getNameSearchable(); + SolrField.SolrType solrType = datasetFieldType.getSolrField().getSolrType(); String type = solrType.getType(); if (solrType.equals(SolrField.SolrType.EMAIL)) { /** @@ -474,7 +475,7 @@ public String getSolrSchema() { */ logger.info("email type detected (" + nameSearchable + ") See also https://github.com/IQSS/dataverse/issues/759"); } - String multivalued = datasetField.getSolrField().isAllowedToBeMultivalued().toString(); + String multivalued = Boolean.toString(datasetFieldType.getSolrField().isAllowedToBeMultivalued() || cvocTermUriMap.containsKey(datasetFieldType.getId())); // sb.append(" \n"); }