END;
$$
LANGUAGE plpgsql;
+
+-- Make sure the word table contains lookup tokens for the given postcode.
+--
+-- postcode:     value written to the 'word' column of the new rows.
+-- lookup_terms: search tokens; one row of type 'P' is inserted per term.
+--
+-- Returns TRUE when rows of type 'P' already existed for the postcode (nothing
+-- is inserted in that case) and FALSE when the rows were newly created.
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+BEGIN
+  -- EXISTS may stop at the first matching row, there is no need to count them.
+  IF EXISTS (SELECT 1 FROM word AS w WHERE w.word = postcode AND w.type = 'P') THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) AS lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
import itertools
import json
import logging
-import re
from textwrap import dedent
from nominatim.db.connection import connect
def _process_place_address(self, token_info, address):
for item in address:
if item.kind == 'postcode':
- self._add_postcode(item.name)
+ token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
return full_tokens, partial_tokens
- def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
+
+            `item` is the address item of the postcode. Its name is
+            normalized with the '@postcode' analyzer when one is
+            available, otherwise it is only stripped and upper-cased.
+
+            Returns the normalized postcode so that the caller can save it
+            in the token info, or None when no search term can be
+            computed for the postcode.
         """
- if re.search(r'[:,;]', postcode) is None:
- postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
- if postcode not in self._cache.postcodes:
- term = self._search_normalized(postcode)
- if not term:
- return
+        if analyzer is None:
+            # No postcode analyzer available: plain normalization, no variants.
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
- with self.conn.cursor() as cur:
- # no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word_token, type, word)
- (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
- WHERE NOT EXISTS
- (SELECT * FROM word
- WHERE type = 'P' and word = pc))
- """, (term, postcode))
- self._cache.postcodes.add(postcode)
+        # The variant base is part of the key used for the cache and the
+        # word table, so the same postcode with a different variant base
+        # gets its own entry.
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+
+        # The caller hands the result on to token_info.set_postcode().
+        return postcode_name
class _TokenInfo:
self.street_tokens = set()
self.place_tokens = set()
self.address_tokens = {}
+ self.postcode = None
@staticmethod
if partials:
self.address_tokens[key] = self._mk_array(partials)
+    def set_postcode(self, postcode):
+        """ Remember the postcode of the place, replacing any value that
+            was stored before.
+        """
+        self.postcode = postcode
+
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
obj.address.pop(pos)
else:
postcode.name = formatted[0]
- postcode.set_attr('lookup', formatted[1])
+ postcode.set_attr('variant', formatted[1])
def scan(self, postcode, country):
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+    """ Return the configuration of this analyzer. Nothing is configurable:
+        both arguments are ignored and the result is always None.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+    """ Build the postcode analyzer from the given normalizer and
+        transliterator. The config argument is not used.
+    """
+    analysis = PostcodeTokenAnalysis(normalizer, transliterator)
+    return analysis
+
+class PostcodeTokenAnalysis:
+    """ Token analysis for postcodes: computes the standard form of a
+        postcode and the spelling variants under which it can be searched.
+    """
+    def __init__(self, norm, trans):
+        # Both objects are only used through their transliterate() method.
+        self.norm = norm
+        self.trans = trans
+
+        # Mutation rule for spaces: a space may be kept or left out.
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def normalize(self, name):
+        """ Return the standard form of the postcode: surrounding whitespace
+            removed and all letters in upper case.
+        """
+        return name.strip().upper()
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized postcode.
+
+            The name is run through the normalizer, expanded by the mutator
+            into variants with optional spaces and each variant is then
+            transliterated. Returns the list of resulting search terms.
+            Empty terms are left out.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure at this point, that the terms are normalized in a way
+        # that they are searchable with the standard transliteration rules.
+        base = self.norm.transliterate(norm_name)
+        transliterated = (self.trans.transliterate(term)
+                          for term in self.mutator.generate([base]) if term)
+        # An empty term cannot be searched for, so never hand one out.
+        return [term for term in transliterated if term]
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: clean-postcodes
convert-to-address: yes
- default-pattern: [A-Z0-9- ]{3,12}
+ default-pattern: "[A-Z0-9- ]{3,12}"
- step: split-name-list
- step: strip-brace-terms
- step: tag-analyzer-by-language
- analyzer: generic
- id: "@housenumber"
analyzer: housenumbers
+ - id: "@postcode"
+ analyzer: postcodes
- id: bg
analyzer: generic
mode: variant-only