Skip to content

Commit

Permalink
Iso19115 indexer, major refactoring and new test cases added (#1317)
Browse files Browse the repository at this point in the history
  • Loading branch information
shaneding authored Jul 13, 2020
1 parent 9b696a0 commit fad12be
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 45 deletions.
93 changes: 50 additions & 43 deletions src/main/java/io/anserini/collection/Iso19115Collection.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,68 +86,75 @@ public void readNext() throws NoSuchElementException {
public static class Document implements SourceDocument{
// Record identifier: the gmd:fileIdentifier text with its last 8 characters removed (see constructor).
protected String id;
// Citation title of the record; exposed through getTitle().
protected String title;
// JSON field names, in order from the root, that lead to the title text.
static final String[] titlePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:title",
"gco:CharacterString"};
// Abstract text of the record.
protected String abstractContent;
// JSON field names, in order from the root, that lead to the abstract text.
static final String[] abstractContentPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:abstract", "gco:CharacterString"};
// The whole JSON record serialized with toString().
protected String raw;
// Name of the organisation listed as the metadata contact.
protected String organisation;
// JSON field names, in order from the root, that lead to the contact organisation's name.
// This must stay distinct from abstractContentPath: pointing it at gmd:abstract would fill
// `organisation` with the abstract text instead of the organisation name.
static final String[] organisationPath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:organisationName", "gco:CharacterString"};
// Individual names of the cited responsible parties, one array slot per party.
protected String[] responsibleParty;
// JSON field names leading to the gmd:citedResponsibleParty node, which the constructor indexes by position.
static final String[] responsiblePartyPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation", "gmd:citedResponsibleParty"};
// Text found at cataloguePath, i.e. the individual name under the metadata contact.
protected String catalogue;
// JSON field names leading to the metadata contact's individual name.
static final String[] cataloguePath = {"gmd:MD_Metadata", "gmd:contact", "gmd:CI_ResponsibleParty", "gmd:individualName", "gco:CharacterString"};
// Metadata date stamp, kept as the text found in the record.
protected String publish_time;
// JSON field names leading to the date stamp.
static final String[] publish_timePath = {"gmd:MD_Metadata", "gmd:dateStamp", "gco:Date"};
// Dataset URI, kept as the text found in the record.
protected String url;
// JSON field names leading to the dataset URI.
static final String[] urlPath = {"gmd:MD_Metadata", "gmd:dataSetURI", "gco:CharacterString"};
// Latitudes for the bounding box; the constructor allocates 4 slots and reads the north/south bounds into them.
protected double[] latitude;
// Longitudes for the bounding box; the constructor allocates 4 slots and reads the west/east bounds into them.
protected double[] longitude;
// String form of the bounding box, assigned from getCoordinateString() at the end of the constructor.
protected String coordinates;
// new entries added
// JSON field names leading to the gmd:EX_GeographicBoundingBox node that holds the four bound values.
static final String[] coordinatePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:extent", "gmd:EX_Extent", "gmd:geographicElement",
"gmd:EX_GeographicBoundingBox"};
// Purpose statement of the record.
protected String purpose;
// JSON field names leading to the purpose text.
static final String[] purposePath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:purpose", "gco:CharacterString"};
// Supplemental information text of the record.
protected String supplInfo;
// JSON field names leading to the supplemental information text.
static final String[] supplInfoPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:supplementalInformation", "gco:CharacterString"};
// Topic category code, kept as text.
protected String topicCategory;
// JSON field names leading to the topic category code.
static final String[] topicCategoryPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:topicCategory", "gmd:MD_TopicCategoryCode"};
// Keywords; the array is sized to the gmd:keyword node under the first gmd:descriptiveKeywords element.
protected String[] keywords;
// JSON field names leading to the gmd:descriptiveKeywords node; the constructor uses element 0 of it.
static final String[] keywordPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"};
// Recommended citation, read from gmd:otherCitationDetails of the record's citation.
protected String recommendedCitation;
// JSON field names leading to the recommended citation text.
static final String[] recommendedCitationPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:citation", "gmd:CI_Citation",
"gmd:otherCitationDetails", "gco:CharacterString"};
// Thesaurus title and its other citation details, joined by " : ".
protected String thesaurusName;
// Same path as keywordPath; the constructor takes element 0 of the node found here.
static final String[] theasurusNameMainPath = {"gmd:MD_Metadata", "gmd:identificationInfo", "gmd:MD_DataIdentification", "gmd:descriptiveKeywords"};
// Applied relative to that element to reach the thesaurus gmd:CI_Citation node.
static final String[] theasurusNameSubPath = {"gmd:MD_Keywords", "gmd:thesaurusName", "gmd:CI_Citation"};

public Document(JsonNode json) {
// extracting the fields from the ISO19115 file
this.raw = json.toString();
String identifier = json.get("gmd:MD_Metadata").get("gmd:fileIdentifier").get("gco:CharacterString").asText();
// extracting the id in the beginning of the text
this.id = identifier.substring(0,identifier.length() - 8);
this.title = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation")
.get("gmd:CI_Citation").get("gmd:title").get("gco:CharacterString").asText();
this.abstractContent = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification")
.get("gmd:abstract").get("gco:CharacterString").asText();
this.organisation = json.get("gmd:MD_Metadata").get("gmd:contact").get("gmd:CI_ResponsibleParty").get("gmd:organisationName")
.get("gco:CharacterString").asText();
this.catalogue = json.get("gmd:MD_Metadata").get("gmd:contact").get("gmd:CI_ResponsibleParty").get("gmd:individualName")
.get("gco:CharacterString").asText();
this.publish_time = json.get("gmd:MD_Metadata").get("gmd:dateStamp").get("gco:Date").asText();
this.url = json.get("gmd:MD_Metadata").get("gmd:dataSetURI").get("gco:CharacterString").asText();
this.purpose = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:purpose")
.get("gco:CharacterString").asText();
this.supplInfo = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:supplementalInformation")
.get("gco:CharacterString").asText();
this.topicCategory = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:topicCategory")
.get("gmd:MD_TopicCategoryCode").asText();
this.recommendedCitation = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation")
.get("gmd:CI_Citation").get("gmd:otherCitationDetails").get("gco:CharacterString").asText();
this.thesaurusName = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords")
.get(0).get("gmd:MD_Keywords").get("gmd:thesaurusName").get("gmd:CI_Citation").get("gmd:title").get("gco:CharacterString").asText()

this.title = extractNode(titlePath, json).asText();
this.abstractContent = extractNode(abstractContentPath, json).asText();
this.organisation = extractNode(organisationPath, json).asText();
this.catalogue = extractNode(cataloguePath, json).asText();
this.publish_time = extractNode(publish_timePath, json).asText();
this.url = extractNode(urlPath, json).asText();
this.purpose = extractNode(purposePath, json).asText();
this.supplInfo = extractNode(supplInfoPath, json).asText();
this.topicCategory = extractNode(topicCategoryPath, json).asText();
this.recommendedCitation = extractNode(recommendedCitationPath, json).asText();

JsonNode mainThesaurusNode = extractNode(theasurusNameMainPath, json).get(0);
this.thesaurusName = extractNode(theasurusNameSubPath, mainThesaurusNode).get("gmd:title").get("gco:CharacterString").asText()
+ " : " +
json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords")
.get(0).get("gmd:MD_Keywords").get("gmd:thesaurusName").get("gmd:CI_Citation").get("gmd:otherCitationDetails").get("gco:CharacterString")
.asText();

// extracting all the authors of the paper
JsonNode parties_node = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation")
.get("gmd:CI_Citation").get("gmd:citedResponsibleParty");
// extracting individual authors from the ResponsibleParty field
extractNode(theasurusNameSubPath, mainThesaurusNode).get("gmd:otherCitationDetails").get("gco:CharacterString").asText();

// extracting all the responsible parties of the paper
JsonNode parties_node = extractNode(responsiblePartyPath, json);
// extracting individual parties from the ResponsibleParty field
int number_of_parties = parties_node.size();
responsibleParty = new String[number_of_parties];
for(int i=0; i < number_of_parties; i++){
responsibleParty[i] = parties_node.get(i).get("gmd:CI_ResponsibleParty").get("gmd:individualName").get("gco:CharacterString").asText();
}

// extracting all the keywords of the paper
JsonNode keyword_node = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:descriptiveKeywords")
.get(0).get("gmd:MD_Keywords").get("gmd:keyword");
JsonNode keyword_node = extractNode(keywordPath, json).get(0).get("gmd:MD_Keywords").get("gmd:keyword");
// extracting individual keyword from the keyword field
int number_of_keywords = keyword_node.size();
keywords = new String[number_of_keywords];
Expand All @@ -158,10 +165,9 @@ public Document(JsonNode json) {

// extracting the latitudes from the paper, 5 points as the polygon needs to be enclosed
latitude = new double[4];
latitude[0] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent")
.get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:northBoundLatitude").get("gco:Decimal").asDouble();
latitude[2] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent")
.get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:southBoundLatitude").get("gco:Decimal").asDouble();
JsonNode coordinateNode = extractNode(coordinatePath, json);
latitude[0] = coordinateNode.get("gmd:northBoundLatitude").get("gco:Decimal").asDouble();
latitude[2] = coordinateNode.get("gmd:southBoundLatitude").get("gco:Decimal").asDouble();
// ensuring that a single coordinate location will be drawn as a small rectangle
if (latitude[0] == latitude[2]) {
latitude[0] -= 0.01;
Expand All @@ -172,10 +178,8 @@ public Document(JsonNode json) {

// extracting the longitudes from the paper, again 5 points are needed to enclose the polygon
longitude = new double[4];
longitude[0] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent")
.get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:westBoundLongitude").get("gco:Decimal").asDouble();
longitude[1] = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:extent").get("gmd:EX_Extent")
.get("gmd:geographicElement").get("gmd:EX_GeographicBoundingBox").get("gmd:eastBoundLongitude").get("gco:Decimal").asDouble();
longitude[0] = coordinateNode.get("gmd:westBoundLongitude").get("gco:Decimal").asDouble();
longitude[1] = coordinateNode.get("gmd:eastBoundLongitude").get("gco:Decimal").asDouble();
// ensuring that a single coordinate location will be drawn as a small rectangle
if (longitude[0] == longitude[1]) {
longitude[0] -= 0.01;
Expand All @@ -187,6 +191,13 @@ public Document(JsonNode json) {
this.coordinates = getCoordinateString();
}

/**
 * Walks down a JSON tree by following the given field names in order.
 *
 * @param nodeNames field names to follow, starting from {@code json}
 * @param json node at which the walk starts
 * @return the node reached after the last name; this is whatever {@code JsonNode.get} yields for
 *     the last name (so it may be {@code null}), or {@code json} itself when {@code nodeNames}
 *     is empty
 * @throws NullPointerException if a node is missing before the last name is reached; the message
 *     names the missing element and the full path being resolved
 */
private JsonNode extractNode(String[] nodeNames, JsonNode json) {
  JsonNode current = json;
  String reachedVia = "<root>";
  for (String name : nodeNames) {
    if (current == null) {
      // Same exception type as the implicit dereference failure, but it says which element is absent.
      throw new NullPointerException(
          "Missing JSON node '" + reachedVia + "' while resolving path " + String.join("/", nodeNames));
    }
    current = current.get(name);
    reachedVia = name;
  }
  return current;
}

/**
 * Returns the title held by this document.
 *
 * @return the value of the {@code title} field
 */
public String getTitle() {
  return this.title;
}
Expand Down Expand Up @@ -258,10 +269,6 @@ private String getCoordinateString() {
return coordinates.toString();
}

// public double[] getLatitude() { return latitude; }

// public double[] getLongitude() { return longitude; }

@Override
public String id() {
return id;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,20 @@ public void setUp() throws Exception {

totalSegments = 1;
totalDocs = 2;
expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract"));
expected.put("13007", Map.of("id", "13007", "title","Test title 2", "abstract", "Test abstract 2"));
expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract", "coordinates", "[[43.862008,-80.7178777],[43.862008,-80.272744],[43.6764444,-80.272744],[43.6764444,-80.7178777]]",
"thesaurusName", "Polar Data Catalogue Thesaurus (Canada) : https://www.polardata.ca/pdcinput/public/keywordlibrary"));
expected.put("13007", Map.of("id", "13007", "title","Test title 2", "abstract", "Test abstract 2", "coordinates", "[[43.452,-80.634],[43.452,-80.49],[43.313,-80.49],[43.313,-80.634]]",
"thesaurusName", "Polar Data Catalogue Thesaurus (Canada) : https://www.polardata.ca/pdcinput/public/keywordlibrary"));
}

/**
 * Checks one parsed document against its expected field values.
 *
 * @param doc the document produced by the collection under test
 * @param expected expected values keyed by "id", "title", "abstract", "coordinates" and
 *     "thesaurusName"
 */
@Override
void checkDocument(SourceDocument doc, Map<String, String> expected) {
  assertTrue(doc.indexable());
  assertEquals(expected.get("id"), doc.id());

  // Narrow once, after the generic checks, to reach the collection-specific getters.
  final Iso19115Collection.Document isoDoc = (Iso19115Collection.Document) doc;
  assertEquals(expected.get("title"), isoDoc.getTitle());
  assertEquals(expected.get("abstract"), isoDoc.getAbstract());
  assertEquals(expected.get("coordinates"), isoDoc.getCoordinates());
  assertEquals(expected.get("thesaurusName"), isoDoc.getThesaurusName());
}
}

0 comments on commit fad12be

Please sign in to comment.