From: Sarah Hoffmann Date: Tue, 10 Apr 2018 20:48:17 +0000 (+0200) Subject: ignore Unicode format characters for normalization X-Git-Tag: v3.2.0~93^2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/ae83ceab5ef97988cf9ea375ae4cf7afd1c05110?ds=sidebyside ignore Unicode format characters for normalization Also adds tests. Fixes #1007. --- diff --git a/settings/defaults.php b/settings/defaults.php index 81c19c74..2d8f47d0 100644 --- a/settings/defaults.php +++ b/settings/defaults.php @@ -20,7 +20,7 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true); // Rules for normalizing terms for comparison before doing comparisons. // The default is to remove accents and punctuation and to lower-case the // term. Spaces are kept but collapsed to one standard space. -@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); +@define('CONST_Term_Normalization_Rules', ":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); // Set to false to avoid importing extra postcodes for the US. @define('CONST_Use_Extra_US_Postcodes', true); diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 4c49dc5b..7fde34d3 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -95,7 +95,6 @@ Feature: Import of postcodes | object | postcode | | W93 | 445023 | - @wip Scenario: Postcodes from admin boundaries are preferred over estimated postcodes Given the scene admin-areas And the named places diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature index 1ef1fcbe..32052647 100644 --- a/test/bdd/db/query/normalization.feature +++ b/test/bdd/db/query/normalization.feature @@ -136,3 +136,13 @@ Feature: Import and search of names Then results contain | ID | osm_type | osm_id | | 0 | R | 1 | + + Scenario: Unprintable characters in postcodes are ignored + Given the named places + | osm | class | type | address | + | N234 | amenity | prison | 'postcode' : u'1234\u200e' | + When importing + And searching for "1234" + Then results contain + | ID | osm_type | + | 0 | P | diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature index 7db8d629..e2594343 100644 --- a/test/bdd/osm2pgsql/import/tags.feature +++ b/test/bdd/osm2pgsql/import/tags.feature @@ -96,6 +96,15 @@ Feature: Tag evaluation | N3 | 'name: de' : 'Foo', 'name:\\\\' : 'real3' | | N4 | 'name: de' : 'Foo', 'name' : 'rea\\l3' | + Scenario: Unprintable character in address tag are maintained + When loading osm data + """ + n23 Tamenity=yes,name=foo,addr:postcode=1234%200e% + """ + Then place contains + | object | address | + | N23 | 'postcode' : u'1234\u200e' | + Scenario Outline: Included places When loading osm data """ diff --git a/test/bdd/steps/db_ops.py b/test/bdd/steps/db_ops.py index be2211fa..87babdad 100644 --- a/test/bdd/steps/db_ops.py +++ b/test/bdd/steps/db_ops.py @@ -22,6 +22,8 @@ class PlaceColumn: self.add_hstore('extratags', key[6:], value) elif key.startswith('addr+'): self.add_hstore('address', key[5:], value) + elif key in ('name', 'address', 'extratags'): + self.columns[key] = eval('{' + value + '}') else: assert_in(key, ('class', 'type')) self.columns[key] = None if value == '' else value