X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f9b56a8581be3663239d23ee1194891dde9a3857..86588419fb1c3fffe131c0e8d99ecea3c77d67c5:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 90caec1c..cfbb44e3 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -1,3 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. """ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. @@ -407,14 +413,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): - hnrs = [] + hnrs = set() addr_terms = [] streets = [] for item in address: if item.kind == 'postcode': self._add_postcode(item.name) - elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(item.name) + elif item.kind == 'housenumber': + norm_name = self._make_standard_hnr(item.name) + if norm_name: + hnrs.add(norm_name) elif item.kind == 'street': streets.extend(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': @@ -425,8 +433,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: - hnrs = self._split_housenumbers(hnrs) - token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) + token_info.add_housenumbers(self.conn, hnrs) if addr_terms: token_info.add_address_terms(addr_terms) @@ -539,24 +546,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): self._cache.postcodes.add(postcode) - @staticmethod - def _split_housenumbers(hnrs): - if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: - # split numbers if necessary - simple_list = [] - for hnr in hnrs: - simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) - - if len(simple_list) > 1: - hnrs = list(set(simple_list)) - else: - hnrs = simple_list - - return hnrs - - - - class _TokenInfo: """ Collect token information to be sent back to the database. """