X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f03a05f6bb33918712691657a2179c96da26e93b..815e3cccf6840d935e5a708588700cca23b976e9:/nominatim/tokenizer/token_analysis/housenumbers.py diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py index 6a838e00..96e86b28 100644 --- a/nominatim/tokenizer/token_analysis/housenumbers.py +++ b/nominatim/tokenizer/token_analysis/housenumbers.py @@ -15,17 +15,18 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG RE_NON_DIGIT = re.compile('[^0-9]') RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])') RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)') +RE_NAMED_PART = re.compile(r'[a-z]{4}') ### Configuration section -def configure(rules, normalization_rules): +def configure(rules, normalization_rules): # pylint: disable=W0613 """ All behaviour is currently hard-coded. """ return None ### Analysis section -def create(normalizer, transliterator, config): +def create(normalizer, transliterator, config): # pylint: disable=W0613 """ Create a new token analysis instance for this module. """ return HousenumberTokenAnalysis(normalizer, transliterator) @@ -48,8 +49,14 @@ class HousenumberTokenAnalysis: return name norm = self.trans.transliterate(self.norm.transliterate(name)) - norm = RE_DIGIT_ALPHA.sub(r'\1␣\2', norm) - norm = RE_ALPHA_DIGIT.sub(r'\1␣\2', norm) + # If there is a significant non-numeric part, use as is. + if RE_NAMED_PART.search(norm) is None: + # Otherwise add optional spaces between digits and letters. + (norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm) + (norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt) + # Avoid creating too many variants per number. + if cnt1 + cnt2 <= 4: + return norm_opt return norm