diff --git a/CHANGELOG.md b/CHANGELOG.md index ded9553f88de5..2585e7fbb6816 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,6 +86,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Always use `constant_score` query for `match_only_text` field ([#16964](https://github.com/opensearch-project/OpenSearch/pull/16964)) - Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868)) - Fix multi-value sort for unsigned long ([#16732](https://github.com/opensearch-project/OpenSearch/pull/16732)) +- The `phone-search` analyzer no longer emits the tel/sip prefix, international calling code, extension numbers and unformatted input as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993)) ### Security diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java index 6b95594204eb4..e0541755a2b3e 100644 --- a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java @@ -98,7 +98,9 @@ private Set getTokens() throws IOException { // Rip off the "tel:" or "sip:" prefix if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { - tokens.add(input.substring(0, 4)); + if (addNgrams) { + tokens.add(input.substring(0, 4)); + } input = input.substring(4); } @@ -128,14 +130,23 @@ private Set getTokens() throws IOException { countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); input = String.valueOf(numberProto.getNationalNumber()); - // Add Country code, extension, and the number as tokens - tokens.add(countryCode.get()); + // add full number as tokens tokens.add(countryCode.get() + input); - if (!Strings.isEmpty(numberProto.getExtension())) { - tokens.add(numberProto.getExtension()); + + if (addNgrams) { + // Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the + // same country code + tokens.add(countryCode.get()); + + // Add extension without country code (not done for search analyzer as that might match numbers in other countries as + // well!) + if (!Strings.isEmpty(numberProto.getExtension())) { + tokens.add(numberProto.getExtension()); + } + // Add unformatted input (most likely the same as the extension now since the prefix has been removed) + tokens.add(input); } - tokens.add(input); } } catch (final NumberParseException | StringIndexOutOfBoundsException e) { // Libphone didn't like it, no biggie. We'll just ngram the number as it is. diff --git a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java index 332f6d21f47d6..503cee9cc710f 100644 --- a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java +++ b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java @@ -87,11 +87,7 @@ public void testEuropeDetailled() throws IOException { * Test for all tokens which are emitted by the "phone" analyzer. */ public void testEuropeDetailledSearch() throws IOException { - assertTokensAreInAnyOrder( - phoneSearchAnalyzer, - "tel:+441344840400", - Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400") - ); + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+441344840400", Arrays.asList("tel:+441344840400", "441344840400")); } public void testEurope() throws IOException { @@ -163,7 +159,11 @@ public void testSipWithoutDomainPart() throws IOException { } public void testTelPrefix() throws IOException { - assertTokensInclude("tel:+1228", Arrays.asList("1228", "122", "228")); + assertTokensInclude(phoneAnalyzer, "tel:+1228", Arrays.asList("tel:+1228", "tel:", "1228", "122", "228")); + } + + public void testTelPrefixSearch() throws IOException { + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+1228", Arrays.asList("tel:+1228", "1228")); } public void testNumberPrefix() throws IOException { @@ -189,21 +189,21 @@ public void testLocalNumberWithCH() throws IOException { } public void testSearchInternationalPrefixWithZZ() throws IOException { - assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010")); } public void testSearchInternationalPrefixWithCH() throws IOException { - assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010")); } public void testSearchNationalPrefixWithCH() throws IOException { // + is equivalent to 00 in Switzerland - assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("0041583161010", "41583161010")); } public void testSearchLocalNumberWithCH() throws IOException { // when omitting the international prefix swiss numbers must start with '0' - assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("0583161010", "41583161010")); } /** diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml index 0bd7d2c371bfc..1c51bfa3c5347 100644 --- a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml +++ b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml @@ -32,9 +32,37 @@ index: test id: 1 body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" } + - do: + index: + index: test + id: 2 + body: { "phone": "+41 58 316 99 99", "phone-ch": "058 316 99 99" } + - do: + index: + index: test + id: 3 + # number not used in the examples below, just present to make sure that it's never matched + body: { "phone": "+41 12 345 67 89", "phone-ch": "012 345 67 89" } + - do: + index: + index: test + id: 4 + # germany has a different phone number length, but for this test we ignore it and pretend they're the same + body: { "phone": "+49 58 316 10 10", "phone-ch": "+49 58 316 10 10" } + - do: + index: + index: test + id: 5 + body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" } + - do: + index: + index: test + id: 6 + body: { "phone": "tel:+441344840400", "phone-ch": "tel:+441344840400" } - do: indices.refresh: {} + # international format in document & search will always work - do: search: rest_total_hits_as_int: true @@ -45,6 +73,7 @@ "phone": "+41583161010" - match: { hits.total: 1 } + # correct national format & international format in search will always work - do: search: rest_total_hits_as_int: true @@ -54,3 +83,113 @@ match: "phone-ch": "+41583161010" - match: { hits.total: 1 } + + # national format without country specified won't work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "0583161010" + - match: { hits.total: 0 } + + # correct national format with country specified in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "0583161010" + - match: { hits.total: 1 } + + # search-as-you-type style query + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+4158316" + - match: { hits.total: 2 } + + # search-as-you-type style query + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "058316" + - match: { hits.total: 2 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+1 888 280 4331" + - match: { hits.total: 1 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "+1 888 280 4331" + - match: { hits.total: 1 } + + # national format in search won't work if no country is specified + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "888 280 4331" + - match: { hits.total: 0 } + + # document & search have a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "tel:+441344840400" + - match: { hits.total: 1 } + + # only document has a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+441344840400" + - match: { hits.total: 1 } + + # only search has a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "tel:+1 888 280 4331" + - match: { hits.total: 1 }