diff --git a/src/main/java/io/anserini/collection/Iso19115Collection.java b/src/main/java/io/anserini/collection/Iso19115Collection.java index b0a640694a..5e68d2734d 100644 --- a/src/main/java/io/anserini/collection/Iso19115Collection.java +++ b/src/main/java/io/anserini/collection/Iso19115Collection.java @@ -86,23 +86,40 @@ public void readNext() throws NoSuchElementException { public static class Document implements SourceDocument{ protected String id; protected String title; + static final String[] titlePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:title", + "gco:CharacterString"}; protected String abstractContent; + static final String[] abstractContentPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:abstract", "gco:CharacterString"}; protected String raw; protected String organisation; + static final String[] organisationPath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:organisationName", "gco:CharacterString"}; protected String[] responsibleParty; + static final String[] responsiblePartyPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:citedResponsibleParty"}; protected String catalogue; + static final String[] cataloguePath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:individualName", "gco:CharacterString"}; protected String publish_time; + static final String[] publish_timePath = {"gmd:MD_Metadata", "gmd:dateStamp", "gco:Date"}; protected String url; + static final String[] urlPath = {"gmd:MD_Metadata", "gmd:dataSetURI", "gco:CharacterString"}; protected double[] latitude; protected double[] longitude; protected String coordinates; - // new entried added + static final String[] coordinatePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:extent", "gmd:EX_Extent", "gmd:geographicElement", 
+ "gmd:EX_GeographicBoundingBox"}; protected String purpose; + static final String[] purposePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:purpose", "gco:CharacterString"}; protected String supplInfo; + static final String[] supplInfoPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:supplementalInformation", "gco:CharacterString"}; protected String topicCategory; + static final String[] topicCategoryPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:topicCategory", "gmd:MD_TopicCategoryCode"}; protected String[] keywords; + static final String[] keywordPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"}; protected String recommendedCitation; + static final String[] recommendedCitationPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", + "gmd:otherCitationDetails", "gco:CharacterString"}; protected String thesaurusName; + static final String[] thesaurusNameMainPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"}; + static final String[] thesaurusNameSubPath = {"gmd:MD_Keywords", "gmd:thesaurusName", "gmd:CI_Citation"}; public Document(JsonNode json) { // extracting the fields from the ISO19115 file @@ -110,35 +127,26 @@ public Document(JsonNode json) { String identifier = json.get("gmd:MD_Metadata").get("gmd:fileIdentifier").get("gco:CharacterString").asText(); // extracting the id in the beginning of the text this.id = identifier.substring(0,identifier.length() - 8); - this.title = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation") - .get("gmd:CI_Citation").get("gmd:title").get("gco:CharacterString").asText(); - this.abstractContent = 
json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification") - .get("gmd:abstract").get("gco:CharacterString").asText(); - this.organisation = json.get("gmd:MD_Metadata").get("gmd:contact").get("gmd:CI_ResponsibleParty").get("gmd:organisationName") - .get("gco:CharacterString").asText(); - this.catalogue = json.get("gmd:MD_Metadata").get("gmd:contact").get("gmd:CI_ResponsibleParty").get("gmd:individualName") - .get("gco:CharacterString").asText(); - this.publish_time = json.get("gmd:MD_Metadata").get("gmd:dateStamp").get("gco:Date").asText(); - this.url = json.get("gmd:MD_Metadata").get("gmd:dataSetURI").get("gco:CharacterString").asText(); - this.purpose = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:purpose") - .get("gco:CharacterString").asText(); - this.supplInfo = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:supplementalInformation") - .get("gco:CharacterString").asText(); - this.topicCategory = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:topicCategory") - .get("gmd:MD_TopicCategoryCode").asText(); - this.recommendedCitation = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation") - .get("gmd:CI_Citation").get("gmd:otherCitationDetails").get("gco:CharacterString").asText(); - this.thesaurusName = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords") - .get(0).get("gmd:MD_Keywords").get("gmd:thesaurusName").get("gmd:CI_Citation").get("gmd:title").get("gco:CharacterString").asText() + + this.title = extractNode(titlePath, json).asText(); + this.abstractContent = extractNode(abstractContentPath, json).asText(); + this.organisation = extractNode(organisationPath, json).asText(); + this.catalogue = extractNode(cataloguePath, json).asText(); + 
this.publish_time = extractNode(publish_timePath, json).asText(); + this.url = extractNode(urlPath, json).asText(); + this.purpose = extractNode(purposePath, json).asText(); + this.supplInfo = extractNode(supplInfoPath, json).asText(); + this.topicCategory = extractNode(topicCategoryPath, json).asText(); + this.recommendedCitation = extractNode(recommendedCitationPath, json).asText(); + + JsonNode mainThesaurusNode = extractNode(thesaurusNameMainPath, json).get(0); + this.thesaurusName = extractNode(thesaurusNameSubPath, mainThesaurusNode).get("gmd:title").get("gco:CharacterString").asText() + " : " + - json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords") - .get(0).get("gmd:MD_Keywords").get("gmd:thesaurusName").get("gmd:CI_Citation").get("gmd:otherCitationDetails").get("gco:CharacterString") - .asText(); - - // extracting all the authors of the paper - JsonNode parties_node = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation") - .get("gmd:CI_Citation").get("gmd:citedResponsibleParty"); - // extracting individual authors from the ResponsibleParty field + extractNode(thesaurusNameSubPath, mainThesaurusNode).get("gmd:otherCitationDetails").get("gco:CharacterString").asText(); + + // extracting all the responsible parties of the paper + JsonNode parties_node = extractNode(responsiblePartyPath, json); + // extracting individual parties from the ResponsibleParty field int number_of_parties = parties_node.size(); responsibleParty = new String[number_of_parties]; for(int i=0; i < number_of_parties; i++){ @@ -146,8 +154,7 @@ public Document(JsonNode json) { } // extracting all the keywords of the paper - JsonNode keyword_node = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords") - .get(0).get("gmd:MD_Keywords").get("gmd:keyword"); + JsonNode keyword_node = 
extractNode(keywordPath, json).get(0).get("gmd:MD_Keywords").get("gmd:keyword"); // extracting individual keyword from the keyword field int number_of_keywords = keyword_node.size(); keywords = new String[number_of_keywords]; @@ -158,10 +165,9 @@ public Document(JsonNode json) { // extracting the latitudes from the paper, 5 points as the polygon needs to be enclosed latitude = new double[4]; - latitude[0] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent") - .get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:northBoundLatitude").get("gco:Decimal").asDouble(); - latitude[2] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent") - .get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:southBoundLatitude").get("gco:Decimal").asDouble(); + JsonNode coordinateNode = extractNode(coordinatePath, json); + latitude[0] = coordinateNode.get("gmd:northBoundLatitude").get("gco:Decimal").asDouble(); + latitude[2] = coordinateNode.get("gmd:southBoundLatitude").get("gco:Decimal").asDouble(); // ensuring that a single coordinate location will be drawn as a small rectangle if (latitude[0] == latitude[2]) { latitude[0] -= 0.01; @@ -172,10 +178,8 @@ public Document(JsonNode json) { // extracting the longitudes from the paper, again 5 points are needed to enclose the polygon longitude = new double[4]; - longitude[0] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent") - .get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:westBoundLongitude").get("gco:Decimal").asDouble(); - longitude[1] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent") - 
.get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:eastBoundLongitude").get("gco:Decimal").asDouble(); + longitude[0] = coordinateNode.get("gmd:westBoundLongitude").get("gco:Decimal").asDouble(); + longitude[1] = coordinateNode.get("gmd:eastBoundLongitude").get("gco:Decimal").asDouble(); // ensuring that a single coordinate location will be drawn as a small rectangle if (longitude[0] == longitude[1]) { longitude[0] -= 0.01; @@ -187,6 +191,25 @@ public Document(JsonNode json) { this.coordinates = getCoordinateString(); } + /** + * Follows {@code nodeNames} from {@code json}, one child lookup per name. + * + * @param nodeNames field names to descend through, outermost first + * @param json node the path starts from + * @return the node reached after the last name + * @throws NullPointerException if a name on the path has no matching child + */ + private static JsonNode extractNode(String[] nodeNames, JsonNode json) { + JsonNode current = json; + for (String name : nodeNames) { + current = current.get(name); + if (current == null) { + throw new NullPointerException("No \"" + name + "\" element on the expected ISO19115 path"); + } + } + return current; + } + public String getTitle() { return title; } @@ -258,10 +281,6 @@ private String getCoordinateString() { return coordinates.toString(); } - // public double[] getLatitude() { return latitude; } - - // public double[] getLongitude() { return longitude; } - @Override public String id() { return id; diff --git a/src/test/java/io/anserini/collection/Iso19115CollectionTest.java b/src/test/java/io/anserini/collection/Iso19115CollectionTest.java index ed494154b3..37c55d5cb1 100644 --- a/src/test/java/io/anserini/collection/Iso19115CollectionTest.java +++ b/src/test/java/io/anserini/collection/Iso19115CollectionTest.java @@ -35,15 +35,19 @@ public void setUp() throws Exception { totalSegments = 1; totalDocs = 2; - expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract")); - 
expected.put("13007", Map.of("id", "13007", "title","Test title 2", "abstract", "Test abstract 2")); + expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract", "coordinates", "[[43.862008,-80.7178777],[43.862008,-80.272744],[43.6764444,-80.272744],[43.6764444,-80.7178777]]", + "thesaurusName", "Polar Data Catalogue Thesaurus (Canada) : https://www.polardata.ca/pdcinput/public/keywordlibrary")); + expected.put("13007", Map.of("id", "13007", 
"title","Test title 2", "abstract", "Test abstract 2", "coordinates", "[[43.452,-80.634],[43.452,-80.49],[43.313,-80.49],[43.313,-80.634]]", + "thesaurusName", "Polar Data Catalogue Thesaurus (Canada) : https://www.polardata.ca/pdcinput/public/keywordlibrary")); } @Override void checkDocument(SourceDocument doc, Map expected) { assertTrue(doc.indexable()); assertEquals(expected.get("id"), doc.id()); assertEquals(expected.get("title"), ((Iso19115Collection.Document) doc).getTitle()); assertEquals(expected.get("abstract"), ((Iso19115Collection.Document) doc).getAbstract()); + assertEquals(expected.get("coordinates"), ((Iso19115Collection.Document) doc).getCoordinates()); + assertEquals(expected.get("thesaurusName"), ((Iso19115Collection.Document) doc).getThesaurusName()); } }