From 2cb45c1ad63c576f32a510f2db47a6c5cf3f6bdd Mon Sep 17 00:00:00 2001 From: missinglink Date: Sat, 8 Jan 2022 11:44:21 +0100 Subject: [PATCH] feat(post): add alias for compound street names with abbreviated generic --- post/_contractions_abbreviated.json | 23 +++++++++ post/seperable_street_names.js | 46 +++++++++--------- test/post/seperable_street_names.js | 72 +++++++++++++++++++++++++---- 3 files changed, 110 insertions(+), 31 deletions(-) create mode 100644 post/_contractions_abbreviated.json diff --git a/post/_contractions_abbreviated.json b/post/_contractions_abbreviated.json new file mode 100644 index 0000000..c69597a --- /dev/null +++ b/post/_contractions_abbreviated.json @@ -0,0 +1,23 @@ +{ + "DEU": { + "platz": "pl", + "markt": "mkt", + "straße": "str", + "strasse": "str" + }, + "CHE": { + "platz": "pl", + "markt": "mkt", + "straße": "str", + "strasse": "str" + }, + "AUT": { + "platz": "pl", + "markt": "mkt", + "straße": "str", + "strasse": "str" + }, + "NLD": { + "straat": "str" + } +} diff --git a/post/seperable_street_names.js b/post/seperable_street_names.js index 6a41d42..87a8dde 100644 --- a/post/seperable_street_names.js +++ b/post/seperable_street_names.js @@ -2,6 +2,7 @@ const _ = require('lodash'); const TARGET_LAYERS = [ 'street', 'address', 'intersection' ]; const expansions = require('./_expansions.json'); const contractions = require('./_contractions.json'); +const contractions_abbreviated = require('./_contractions_abbreviated.json'); function expand(str, mapping) { let tokens = str.split(' '); @@ -38,61 +39,55 @@ function contract(str, mapping) { function expandAllFields(doc, mapping){ // index expanded version of default name - const name = doc.getName('default'); - if (_.isString(name) && !_.isEmpty(name)) { + _.castArray(_.get(doc.name, 'default', [])).forEach(name => { const expanded = expand(name, mapping); if (_.isString(expanded) && !_.isEmpty(expanded) && (name !== expanded)) { doc.setNameAlias('default', expanded); } - } + }); 
// index expanded version of street name - const street = doc.getAddress('street'); - if (_.isString(street) && !_.isEmpty(street)) { + _.castArray(_.get(doc.address_parts, 'street', [])).forEach(street => { const expanded = expand(street, mapping); if (_.isString(expanded) && !_.isEmpty(expanded) && (street !== expanded)) { doc.setAddressAlias('street', expanded); } - } + }); // index expanded version of cross_street name - const cross_street = doc.getAddress('cross_street'); - if (_.isString(cross_street) && !_.isEmpty(cross_street)) { + _.castArray(_.get(doc.address_parts, 'cross_street', [])).forEach(cross_street => { const expanded = expand(cross_street, mapping); if (_.isString(expanded) && !_.isEmpty(expanded) && (cross_street !== expanded)) { doc.setAddressAlias('cross_street', expanded); } - } + }); } function contractAllFields(doc, mapping) { // index expanded version of default name - const name = doc.getName('default'); - if (_.isString(name) && !_.isEmpty(name)) { + _.castArray(_.get(doc.name, 'default', [])).forEach(name => { const contracted = contract(name, mapping); if (_.isString(contracted) && !_.isEmpty(contracted) && (name !== contracted)) { doc.setNameAlias('default', contracted); } - } + }); // index contracted version of street name - const street = doc.getAddress('street'); - if (_.isString(street) && !_.isEmpty(street)) { + _.castArray(_.get(doc.address_parts, 'street', [])).forEach(street => { const contracted = contract(street, mapping); if (_.isString(contracted) && !_.isEmpty(contracted) && (street !== contracted)) { doc.setAddressAlias('street', contracted); } - } + }); // index contracted version of cross_street name - const cross_street = doc.getAddress('cross_street'); - if (_.isString(cross_street) && !_.isEmpty(cross_street)) { + _.castArray(_.get(doc.address_parts, 'cross_street', [])).forEach(cross_street => { const contracted = contract(cross_street, mapping); if (_.isString(contracted) && !_.isEmpty(contracted) && 
(cross_street !== contracted)) { doc.setAddressAlias('cross_street', contracted); } - } + }); } function post(doc) { @@ -101,20 +96,27 @@ function post(doc) { if( !TARGET_LAYERS.includes( doc.getLayer() ) ) { return; } // detect document country code - let docCountryCode = _.get(doc, 'parent.country_a[0]'); + let docCountryCode = _.get(doc, 'parent.country_a[0]') || _.get(doc, 'parent.dependency_a[0]'); if( !_.isString(docCountryCode) || docCountryCode.length !== 3 ) { return; } + docCountryCode = docCountryCode.toUpperCase(); // expansions - let mapping_expansions = expansions[docCountryCode.toUpperCase()]; + const mapping_expansions = expansions[docCountryCode]; if( _.isObject( mapping_expansions ) ) { expandAllFields(doc, mapping_expansions); } // contractions - let mapping_contractions = contractions[docCountryCode.toUpperCase()]; + const mapping_contractions = contractions[docCountryCode]; if( _.isObject( mapping_contractions ) ) { contractAllFields(doc, mapping_contractions); } + + // abbreviated contractions + const mapping_contractions_abbr = contractions_abbreviated[docCountryCode]; + if (_.isObject(mapping_contractions_abbr)) { + contractAllFields(doc, mapping_contractions_abbr); + } } module.exports = { @@ -125,4 +127,4 @@ module.exports = { contractions: contractions, contract: contract, contractAllFields: contractAllFields -}; \ No newline at end of file +}; diff --git a/test/post/seperable_street_names.js b/test/post/seperable_street_names.js index c4041f6..a70d17c 100644 --- a/test/post/seperable_street_names.js +++ b/test/post/seperable_street_names.js @@ -1,11 +1,12 @@ - -var Document = require('../../Document'); -var ssn = require('../../post/seperable_street_names'); +const _ = require('lodash'); +const Document = require('../../Document'); +const ssn = require('../../post/seperable_street_names'); +const dedupe = require('../../post/deduplication'); module.exports.tests = {}; module.exports.tests.expand = function (test) { - test('expand DEU', 
function (t) { + test('expand DEU', function (t) { t.equals('Example Weg', ssn.expand('Examplew.', ssn.expansions.DEU) ); t.equals('Example Weg', ssn.expand('Exampleweg', ssn.expansions.DEU) ); t.equals('Example Quelle', ssn.expand('Exampleq.', ssn.expansions.DEU) ); @@ -180,17 +181,23 @@ module.exports.tests.functional = function (test) { // name aliases defined t.deepEqual(doc.getNameAliases('default'), [ - 'Example Straße & Cross Platz' + 'Example Straße & Cross Platz', + 'Examplestraße & Crossplatz', + 'Examplestr & Crosspl' ]); // street aliases defined t.deepEqual(doc.getAddressAliases('street'), [ 'Example Straße', + 'Examplestraße', + 'Examplestr' ]); // cross_street aliases defined t.deepEqual(doc.getAddressAliases('cross_street'), [ 'Cross Platz', + 'Crossplatz', + 'Crosspl' ]); t.end(); @@ -208,17 +215,20 @@ module.exports.tests.functional = function (test) { // name aliases defined t.deepEqual(doc.getNameAliases('default'), [ - 'Examplestraße & Crossplatz' + 'Examplestraße & Crossplatz', + 'Examplestr & Crosspl' ]); // street aliases defined t.deepEqual(doc.getAddressAliases('street'), [ 'Examplestraße', + 'Examplestr' ]); // cross_street aliases defined t.deepEqual(doc.getAddressAliases('cross_street'), [ 'Crossplatz', + 'Crosspl' ]); t.end(); @@ -235,12 +245,14 @@ module.exports.tests.functional = function (test) { // name aliases defined t.deepEqual(doc.getNameAliases('default'), [ - 'Eberswalderstraße' + 'Eberswalderstraße', + 'Eberswalderstr' ]); // street aliases defined t.deepEqual(doc.getAddressAliases('street'), [ - 'Eberswalderstraße' + 'Eberswalderstraße', + 'Eberswalderstr' ]); t.end(); @@ -259,17 +271,23 @@ module.exports.tests.functional = function (test) { // name aliases defined t.deepEqual(doc.getNameAliases('default'), [ 'Example Straße & Cross Platz', - 'Examplestraße & Crossplatz' + 'Examplestraße & Crossplatz', + 'Examplestraße & Crossplatz', + 'Examplestr & Crossplatz', + 'Examplestr & Crosspl' ]); // street aliases defined 
t.deepEqual(doc.getAddressAliases('street'), [ 'Examplestraße', + 'Examplestr' ]); // cross_street aliases defined t.deepEqual(doc.getAddressAliases('cross_street'), [ 'Cross Platz', + 'Crossplatz', + 'Crosspl' ]); t.end(); @@ -292,6 +310,42 @@ module.exports.tests.functional = function (test) { t.doesNotThrow(() => ssn.post(doc)); t.end(); }); + + test('germanic separable street names', function (t) { + + let generate = (input) => { + var doc = new Document('mysource', 'street', 'myid'); + doc.addParent('country', 'Germany', '1001', 'DEU'); + doc.setName('default', input); + doc.setAddress('street', input); + doc.setAddress('cross_street', input); + ssn.post(doc); + dedupe(doc); + + return doc; + }; + + // test all permutations expand to all forms + // Separated / Compounded + Abbreviated / Compounded Non-Abbreviated + // note: Separated tokens are easily handled by elasticsearch synonyms + // and so do not require explicit substitution here. + t.deepEqual(_.castArray(generate('Foostrasse').name.default), ['Foostrasse', 'Foo Strasse', 'Foostr']); + t.deepEqual(_.castArray(generate('Foostraße').name.default), ['Foostraße', 'Foo Straße', 'Foostr']); + t.deepEqual(_.castArray(generate('Foostr.').name.default), ['Foostr.', 'Foo Straße', 'Foostraße']); + t.deepEqual(_.castArray(generate('Foostr').name.default), ['Foostr', 'Foo Straße', 'Foostraße']); + t.deepEqual(_.castArray(generate('Foo Strasse').name.default), ['Foo Strasse', 'Foostrasse', 'Foostr']); + t.deepEqual(_.castArray(generate('Foo Straße').name.default), ['Foo Straße', 'Foostraße', 'Foostr']); + + // note: these forms with the abbreviated generic are not handled within this script. + // I considered adding synonym substitution functionality but it's complex and better + // handled by https://github.com/pelias/openaddresses/pull/477 + // note: as a general rule, names at index-time should be provided un-abbreviated but may + // be in either abbreviated or un-abbreviated at search time.
+ // t.deepEqual(_.castArray(generate('Foo Str.').name.default), ['Foo Str.', 'Foostraße', 'Foostr']); + // t.deepEqual(_.castArray(generate('Foo Str').name.default), ['Foo Str', 'Foostraße', 'Foostr']); + + t.end(); + }); }; module.exports.all = function (tape, common) {