Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

phone-search analyzer: don't emit sip/tel prefix, int'l prefix, extension & unformatted input #16993

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Always use `constant_score` query for `match_only_text` field ([#16964](https://github.com/opensearch-project/OpenSearch/pull/16964))
- Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868))
- Fix multi-value sort for unsigned long ([#16732](https://github.com/opensearch-project/OpenSearch/pull/16732))
- The `phone-search` analyzer no longer emits the tel/sip prefix, international calling code, extension numbers and unformatted input as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ private Set<String> getTokens() throws IOException {

// Rip off the "tel:" or "sip:" prefix
if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) {
tokens.add(input.substring(0, 4));
if (addNgrams) {
tokens.add(input.substring(0, 4));
}
input = input.substring(4);
}

Expand Down Expand Up @@ -128,14 +130,23 @@ private Set<String> getTokens() throws IOException {
countryCode = Optional.of(String.valueOf(numberProto.getCountryCode()));
input = String.valueOf(numberProto.getNationalNumber());

// Add Country code, extension, and the number as tokens
tokens.add(countryCode.get());
// add full number as tokens
tokens.add(countryCode.get() + input);
if (!Strings.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());

if (addNgrams) {
// Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the
// same country code
tokens.add(countryCode.get());

// Add extension without country code (not done for search analyzer as that might match numbers in other countries as
// well!)
if (!Strings.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());
}
// Add unformatted input (most likely the same as the extension now since the prefix has been removed)
tokens.add(input);
}

tokens.add(input);
}
} catch (final NumberParseException | StringIndexOutOfBoundsException e) {
// Libphone didn't like it, no biggie. We'll just ngram the number as it is.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,7 @@ public void testEuropeDetailled() throws IOException {
* Test for all tokens which are emitted by the "phone" analyzer.
*/
public void testEuropeDetailledSearch() throws IOException {
assertTokensAreInAnyOrder(
phoneSearchAnalyzer,
"tel:+441344840400",
Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400")
);
assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+441344840400", Arrays.asList("tel:+441344840400", "441344840400"));
}

public void testEurope() throws IOException {
Expand Down Expand Up @@ -166,6 +162,10 @@ public void testTelPrefix() throws IOException {
assertTokensInclude("tel:+1228", Arrays.asList("1228", "122", "228"));
}

public void testTelPrefixSearch() throws IOException {
assertTokensInclude("tel:+1228", Arrays.asList("1228"));
}

public void testNumberPrefix() throws IOException {
assertTokensInclude("+1228", Arrays.asList("1228", "122", "228"));
}
Expand All @@ -189,21 +189,21 @@ public void testLocalNumberWithCH() throws IOException {
}

public void testSearchInternationalPrefixWithZZ() throws IOException {
assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010"));
}

public void testSearchInternationalPrefixWithCH() throws IOException {
assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010"));
}

public void testSearchNationalPrefixWithCH() throws IOException {
// + is equivalent to 00 in Switzerland
assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("0041583161010", "41583161010"));
}

public void testSearchLocalNumberWithCH() throws IOException {
// when omitting the international prefix swiss numbers must start with '0'
assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("0583161010", "41583161010"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,37 @@
index: test
id: 1
body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" }
- do:
index:
index: test
id: 2
body: { "phone": "+41 58 316 99 99", "phone-ch": "058 316 99 99" }
- do:
index:
index: test
id: 3
# number not used in the examples below, just present to make sure that it's never matched
body: { "phone": "+41 12 345 67 89", "phone-ch": "012 345 67 89" }
- do:
index:
index: test
id: 4
# germany has a different phone number length, but for this test we ignore it and pretend they're the same
body: { "phone": "+49 58 316 10 10", "phone-ch": "+49 58 316 10 10" }
- do:
index:
index: test
id: 5
body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" }
- do:
index:
index: test
id: 6
body: { "phone": "tel:+441344840400", "phone-ch": "tel:+441344840400" }
- do:
indices.refresh: {}

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
Expand All @@ -45,6 +73,7 @@
"phone": "+41583161010"
- match: { hits.total: 1 }

# correct national format & international format in search will always work
- do:
search:
rest_total_hits_as_int: true
Expand All @@ -54,3 +83,113 @@
match:
"phone-ch": "+41583161010"
- match: { hits.total: 1 }

# national format without country specified won't work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "0583161010"
- match: { hits.total: 0 }

# correct national format with country specified in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "0583161010"
- match: { hits.total: 1 }

# search-as-you-type style query
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "+4158316"
- match: { hits.total: 2 }

# search-as-you-type style query
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "058316"
- match: { hits.total: 2 }

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "+1 888 280 4331"
- match: { hits.total: 1 }

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "+1 888 280 4331"
- match: { hits.total: 1 }

# national format in search won't work if no country is specified
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "888 280 4331"
- match: { hits.total: 0 }

# document & search have a tel: prefix
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "tel:+441344840400"
- match: { hits.total: 1 }

# only document has a tel: prefix
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "+441344840400"
- match: { hits.total: 1 }

# only search has a tel: prefix
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "tel:+1 888 280 4331"
- match: { hits.total: 1 }
Loading