Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Making languages dynamic with command arguments #125

Merged
merged 24 commits into from
Dec 20, 2014
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ dependency-reduced-pom.xml
*.~*

__pycache__

nb-configuration.xml

nbactions.xml
58 changes: 57 additions & 1 deletion es/mappings.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@
"copy_to": [
"collector.it"
]
},
"nl": {
"type": "string",
"index": "no",
"copy_to": [
"collector.nl"
]
}
}
},
Expand Down Expand Up @@ -121,6 +128,13 @@
"copy_to": [
"collector.it"
]
},
"nl": {
"type": "string",
"index": "no",
"copy_to": [
"collector.nl"
]
}
}
},
Expand Down Expand Up @@ -160,6 +174,13 @@
"copy_to": [
"collector.it"
]
},
"nl": {
"type": "string",
"index": "no",
"copy_to": [
"collector.nl"
]
}
}
},
Expand Down Expand Up @@ -243,7 +264,8 @@
"name.en",
"name.de",
"name.fr",
"name.it"
"name.it",
"name.nl"
]
},
"en": {
Expand Down Expand Up @@ -296,6 +318,23 @@
"copy_to": [
"collector.it"
]
},
"nl": {
"type": "string",
"index": "no",
"fields": {
"ngrams": {
"type": "string",
"index_analyzer": "index_ngram"
},
"raw": {
"type": "string",
"index_analyzer": "index_raw"
}
},
"copy_to": [
"collector.nl"
]
}
}
},
Expand Down Expand Up @@ -336,6 +375,13 @@
"copy_to": [
"collector.it"
]
},
"nl": {
"index": "no",
"type": "string",
"copy_to": [
"collector.nl"
]
}
}
},
Expand Down Expand Up @@ -391,6 +437,16 @@
"analyzer": "index_raw"
}
}
},
"nl": {
"type": "string",
"analyzer": "index_ngram",
"fields": {
"raw": {
"type": "string",
"analyzer": "index_raw"
}
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions es/query_location_bias.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@
"script_score": {
"script": "dist = doc['coordinate'].distanceInKm(lat, lon); 0.5 + ( 1.5 / (1.0 + dist/40.0) )",
"params": {
"lat": ${lat},
"lon": ${lon}
"lat": "${lat}",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will cast floats to strings, not sure we really want to do that. What was your rationale here (apart from making the json a vaild one ;) )?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Woops! That is an oversight on my part.. Did not realize it would be casted as string. Will revert this change

"lon": "${lon}"
}
}
}
Expand Down
13 changes: 7 additions & 6 deletions src/main/java/de/komoot/photon/importer/App.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public static void main(String[] rawArgs) {
if(args.getJsonDump() != null) {
try {
final String filename = args.getJsonDump();
final JsonDumper jsonDumper = new JsonDumper(filename);
final JsonDumper jsonDumper = new JsonDumper(filename, args.getNominatimImportLanguages());
NominatimConnector nominatimConnector = new NominatimConnector(args.getHost(), args.getPort(), args.getDatabase(), args.getUser(), args.getPassword());
nominatimConnector.setImporter(jsonDumper);
nominatimConnector.readEntireDatabase();
Expand All @@ -65,16 +65,17 @@ public static void main(String[] rawArgs) {

if(args.isNominatimImport()) {
esServer.recreateIndex(); // dump previous data
Importer importer = new Importer(esNodeClient);
log.info("starting import from nominatim to photon with languages: " + args.getNominatimImportLanguages());
Importer importer = new Importer(esNodeClient, args.getNominatimImportLanguages());
NominatimConnector nominatimConnector = new NominatimConnector(args.getHost(), args.getPort(), args.getDatabase(), args.getUser(), args.getPassword());
nominatimConnector.setImporter(importer);
nominatimConnector.readEntireDatabase();
log.info("imported data from nominatim to photon.");
log.info("imported data from nominatim to photon with languages: " + args.getNominatimImportLanguages());
return;
}

final NominatimUpdater nominatimUpdater = new NominatimUpdater(args.getHost(), args.getPort(), args.getDatabase(), args.getUser(), args.getPassword());
de.komoot.photon.importer.Updater updater = new de.komoot.photon.importer.elasticsearch.Updater(esNodeClient);
de.komoot.photon.importer.Updater updater = new de.komoot.photon.importer.elasticsearch.Updater(esNodeClient, args.getNominatimImportLanguages());
nominatimUpdater.setUpdater(updater);

startApi(args, esNodeClient, nominatimUpdater);
Expand All @@ -99,7 +100,7 @@ public void run() {
});

final Searcher searcher = new Searcher(esNodeClient);
get(new RequestHandler("api", searcher));
get(new RequestHandler("api/", searcher));
get(new RequestHandler("api", searcher, args.getSupportedLanguages()));
get(new RequestHandler("api/", searcher, args.getSupportedLanguages()));
}
}
6 changes: 6 additions & 0 deletions src/main/java/de/komoot/photon/importer/CommandLineArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ public class CommandLineArgs {

@Parameter(names = "-nominatim-import", description = "import nominatim database into photon (this will delete previous index)")
private boolean nominatimImport = false;

@Parameter(names = "-nominatim-import-languages", description = "languages nominatim importer should import, comma seperated (default: 'en,fr,de,it')")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we unify both arguments? could be cleaner

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good idea. I was struggling with a unified name, what do you suggest? Something like '-using-languages' or just '-supported-languages' maybe?

private String nominatimImportLanguages = "en,fr,de,it";

@Parameter(names = "-supported-languages", description = "languages that are supported at run-time, comma seperated (default: 'en,fr,de,it')")
private String supportedLanguages = "en,fr,de,it";

@Parameter(names = "-json", description = "import nominatim database and dump it to a json like files in (useful for developing)")
private String jsonDump = null;
Expand Down
10 changes: 6 additions & 4 deletions src/main/java/de/komoot/photon/importer/RequestHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import com.google.common.base.Joiner;
import de.komoot.photon.importer.elasticsearch.Searcher;
import org.elasticsearch.common.collect.ImmutableSet;
import java.util.Arrays;
import java.util.HashSet;
import org.json.JSONArray;
import org.json.JSONObject;
import spark.Request;
Expand All @@ -19,11 +20,12 @@
*/
public class RequestHandler extends Route {
private final Searcher searcher;
private static final Set<String> supportedLanguages = ImmutableSet.of("de", "en", "fr", "it");

protected RequestHandler(String path, Searcher searcher) {
private final Set<String> supportedLanguages;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see spaces here, while the lines around have tabs, but I'm not sure what's the choice here. @christophlingg? :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably a formatting setting of the Netbeans config I use. I will change it to tabs.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Spaces still seem to be there ;)

protected RequestHandler(String path, Searcher searcher, String languages) {
super(path);
this.searcher = searcher;
this.supportedLanguages = new HashSet<String>(Arrays.asList(languages.split(",")));
}

@Override
Expand Down
44 changes: 29 additions & 15 deletions src/main/java/de/komoot/photon/importer/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
public class Utils {
static final Joiner commaJoiner = Joiner.on(", ").skipNulls();

final static String[] languages = new String[]{"de", "en", "fr", "it"};
//final static String[] languages = new String[]{"de", "en", "fr", "it"};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't need this line, let's remove it :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching that, a bit sloppy of me


public static XContentBuilder convert(PhotonDoc doc) throws IOException {
public static XContentBuilder convert(PhotonDoc doc, String[] languages) throws IOException {
XContentBuilder builder = XContentFactory.jsonBuilder().startObject()
.field(Tags.KEY_OSM_ID, doc.getOsmId())
.field(Tags.KEY_OSM_TYPE, doc.getOsmType())
Expand All @@ -47,11 +47,11 @@ public static XContentBuilder convert(PhotonDoc doc) throws IOException {
builder.field("postcode", doc.getPostcode());
}

writeName(builder, doc.getName());
writeIntlNames(builder, doc.getCity(), "city");
writeIntlNames(builder, doc.getCountry(), "country");
writeIntlNames(builder, doc.getStreet(), "street");
writeContext(builder, doc.getContext());
writeName(builder, doc.getName(), languages);
writeIntlNames(builder, doc.getCity(), "city", languages);
writeIntlNames(builder, doc.getCountry(), "country", languages);
writeIntlNames(builder, doc.getStreet(), "street", languages);
writeContext(builder, doc.getContext(), languages);
writeExtent(builder, doc.getBbox());

return builder;
Expand All @@ -74,8 +74,8 @@ private static void writeExtent(XContentBuilder builder, Envelope bbox) throws I
builder.endObject();
}

private static void writeName(XContentBuilder builder, Map<String, String> name) throws IOException {
Map<String, String> fNames = filterNames(name);
private static void writeName(XContentBuilder builder, Map<String, String> name, String[] languages) throws IOException {
Map<String, String> fNames = filterNames(name, languages);

if(name.get("alt_name") != null)
fNames.put("alt", name.get("alt_name"));
Expand All @@ -102,7 +102,7 @@ private static void write(XContentBuilder builder, Map<String, String> fNames, S
builder.endObject();
}

protected static void writeContext(XContentBuilder builder, Set<Map<String, String>> contexts) throws IOException {
protected static void writeContext(XContentBuilder builder, Set<Map<String, String>> contexts, String[] languages) throws IOException {
final SetMultimap<String, String> multimap = HashMultimap.create();

for(Map<String, String> context : contexts) {
Expand All @@ -129,16 +129,16 @@ protected static void writeContext(XContentBuilder builder, Set<Map<String, Stri
}
}

private static void writeIntlNames(XContentBuilder builder, Map<String, String> names, String name) throws IOException {
Map<String, String> fNames = filterNames(names);
private static void writeIntlNames(XContentBuilder builder, Map<String, String> names, String name, String[] languages) throws IOException {
Map<String, String> fNames = filterNames(names, languages);
write(builder, fNames, name);
}

private static Map<String, String> filterNames(Map<String, String> names) {
return filterNames(names, new HashMap<String, String>());
private static Map<String, String> filterNames(Map<String, String> names, String[] languages) {
return filterNames(names, new HashMap<String, String>(), languages);
}

private static Map<String, String> filterNames(Map<String, String> names, HashMap<String, String> filteredNames) {
private static Map<String, String> filterNames(Map<String, String> names, HashMap<String, String> filteredNames, String[] languages) {
if(names == null) return filteredNames;

if(names.get("name") != null) {
Expand All @@ -153,4 +153,18 @@ private static Map<String, String> filterNames(Map<String, String> names, HashMa

return filteredNames;
}

// http://stackoverflow.com/a/4031040/1437096
public static String stripNonDigits(
final CharSequence input /* inspired by seh's comment */){
final StringBuilder sb = new StringBuilder(
input.length() /* also inspired by seh's comment */);
for(int i = 0; i < input.length(); i++){
final char c = input.charAt(i);
if(c > 47 && c < 58){
sb.append(c);
}
}
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,19 @@ public class Importer implements de.komoot.photon.importer.Importer {
private final String indexType = "place";
private final Client esClient;
private BulkRequestBuilder bulkRequest;
private String[] languages;

public Importer(Client esClient) {
public Importer(Client esClient, String languages) {
this.esClient = esClient;
this.bulkRequest = esClient.prepareBulk();
this.languages = languages.split(",");
}

@Override
public void add(PhotonDoc doc) {
try {
this.bulkRequest.add(this.esClient.prepareIndex(indexName, indexType).
setSource(Utils.convert(doc)).setId(String.valueOf(doc.getPlaceId())));
setSource(Utils.convert(doc, languages)).setId(String.valueOf(doc.getPlaceId())));
} catch(IOException e) {
log.error("could not ", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import de.komoot.photon.importer.Tags;
import de.komoot.photon.importer.osm.OSMTags;
import lombok.extern.slf4j.Slf4j;
import de.komoot.photon.importer.Utils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.text.StrSubstitutor;
Expand Down Expand Up @@ -66,14 +67,14 @@ public List<JSONObject> search(String query, String lang, Double lon, Double lat

SearchResponse response = client.prepareSearch("photon").setSearchType(SearchType.QUERY_AND_FETCH).setQuery(query).setSize(limit).setTimeout(TimeValue.timeValueSeconds(7)).execute().actionGet();
List<JSONObject> results = convert(response.getHits().getHits(), lang);
results = removeStreetDuplicates(results);
results = removeStreetDuplicates(results, lang);
if(results.size() > limit) {
results = results.subList(0, limit);
}
return results;
}

private List<JSONObject> removeStreetDuplicates(List<JSONObject> results) {
private List<JSONObject> removeStreetDuplicates(List<JSONObject> results, String lang) {
List<JSONObject> filteredItems = Lists.newArrayListWithCapacity(results.size());
final HashSet<String> keys = Sets.newHashSet();
for(JSONObject result : results) {
Expand All @@ -84,12 +85,35 @@ private List<JSONObject> removeStreetDuplicates(List<JSONObject> results) {
// street has a postcode and name
String postcode = properties.getString(OSMTags.KEY_POSTCODE);
String name = properties.getString(OSMTags.KEY_NAME);
String key = postcode + ":" + name;
String key = postcode + ":" + name;

if(keys.contains(key)) {
// a street with this name + postcode is already part of the result list
continue;
}
} else if (lang.equals("nl")) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should better land in a separate PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I wasn't sure about having this in the same PR. Because it is dependent of the previous changes from this PR, I thought I would add it here. Will submit this as a separate PR 👍

// check for dutch postcodes (i.e. 6532RA). If a street has the same name and the same 4 numbers in the postcode,
// we can assume it is part of the same street, so only use the first
try {
String letterlessPostcode = Utils.stripNonDigits(postcode);
int postcodeNumbers = Integer.parseInt(letterlessPostcode);
boolean foundMatch = false;

for (String keyString : keys) {
String letterlessKey = Utils.stripNonDigits(keyString);
// also check if name equals,
// which is a safety check for streets that partially match and have the same postcode numbers
String keyName = keyString.split(":")[1];
if (postcodeNumbers == Integer.parseInt(letterlessKey) && keyName.equals(name)) {
foundMatch = true;
break;
}
}

if (foundMatch) {
continue;
}
} catch (NumberFormatException e) {}
}
keys.add(key);
}
}
Expand Down
Loading