introduce and use analyzer for postcodes

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 7bc4720ef56ed82b4d8fe45f484ea8a386ade422..e9812ba0430338e6647d459a0c162eadad0d467c 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
  import itertools
  import json
  import logging
  import itertools
  import json
  import logging
-import re
  from textwrap import dedent
  
  from nominatim.db.connection import connect
  from textwrap import dedent
  
  from nominatim.db.connection import connect
@@ -51,7 +50,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """
          self.loader = ICURuleLoader(config)
  
          """
          self.loader = ICURuleLoader(config)
  
-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
          self._save_config()
  
          if init_db:
          self._save_config()
  
          if init_db:
@@ -67,6 +66,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
+        self._install_php(config.lib_dir.php, overwrite=False)
+
  
      def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
  
      def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
@@ -119,12 +120,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
              if not conn.table_exists('search_name'):
                  return
              with conn.cursor(name="hnr_counter") as cur:
              if not conn.table_exists('search_name'):
                  return
              with conn.cursor(name="hnr_counter") as cur:
-                cur.execute("""SELECT word_id, word_token FROM word
+                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
+                               FROM word
                                 WHERE type = 'H'
                                   AND NOT EXISTS(SELECT * FROM search_name
                                                  WHERE ARRAY[word.word_id] && name_vector)
                                 WHERE type = 'H'
                                   AND NOT EXISTS(SELECT * FROM search_name
                                                  WHERE ARRAY[word.word_id] && name_vector)
-                                 AND (char_length(word_token) > 6
-                                      OR word_token not similar to '\\d+')
+                                 AND (char_length(coalesce(word, word_token)) > 6
+                                      OR coalesce(word, word_token) not similar to '\\d+')
                              """)
                  candidates = {token: wid for wid, token in cur}
              with conn.cursor(name="hnr_counter") as cur:
                              """)
                  candidates = {token: wid for wid, token in cur}
              with conn.cursor(name="hnr_counter") as cur:
@@ -137,6 +139,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                      for hnr in row[0].split(';'):
                          candidates.pop(hnr, None)
              LOG.info("There are %s outdated housenumbers.", len(candidates))
                      for hnr in row[0].split(';'):
                          candidates.pop(hnr, None)
              LOG.info("There are %s outdated housenumbers.", len(candidates))
+            LOG.debug("Outdated housenumbers: %s", candidates.keys())
              if candidates:
                  with conn.cursor() as cur:
                      cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
              if candidates:
                  with conn.cursor() as cur:
                      cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
@@ -172,16 +175,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                       self.loader.make_token_analysis())
  
  
                                       self.loader.make_token_analysis())
  
  
-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
          """ Install the php script for the tokenizer.
          """
          php_file = self.data_dir / "tokenizer.php"
          """ Install the php script for the tokenizer.
          """
          php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+        # A missing script is always written; an existing one is only replaced when overwrite is set.
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
  
  
      def _save_config(self):
  
  
      def _save_config(self):
@@ -272,8 +277,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
                 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
-    @staticmethod
-    def normalize_postcode(postcode):
+    def normalize_postcode(self, postcode):
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
@@ -468,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def _process_place_address(self, token_info, address):
          for item in address:
              if item.kind == 'postcode':
      def _process_place_address(self, token_info, address):
          for item in address:
              if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
              elif item.kind == 'housenumber':
                  token_info.add_housenumber(*self._compute_housenumber_token(item))
              elif item.kind == 'street':
              elif item.kind == 'housenumber':
                  token_info.add_housenumber(*self._compute_housenumber_token(item))
              elif item.kind == 'street':
@@ -477,7 +481,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                  if not item.suffix:
                      token_info.add_place(self._compute_partial_tokens(item.name))
              elif not item.kind.startswith('_') and not item.suffix and \
                  if not item.suffix:
                      token_info.add_place(self._compute_partial_tokens(item.name))
              elif not item.kind.startswith('_') and not item.suffix and \
-                 item.kind not in ('country', 'full'):
+                 item.kind not in ('country', 'full', 'inclusion'):
                  token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
  
  
                  token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
  
  
@@ -588,7 +592,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                      continue
  
                  with self.conn.cursor() as cur:
                      continue
  
                  with self.conn.cursor() as cur:
-                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                  (token_id, variants))
                      full, part = cur.fetchone()
  
                                  (token_id, variants))
                      full, part = cur.fetchone()
  
@@ -600,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return full_tokens, partial_tokens
  
  
          return full_tokens, partial_tokens
  
  
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
          """ Make sure the normalized postcode is present in the word table.
          """
          """ Make sure the normalized postcode is present in the word table.
          """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
  
  
-            if postcode not in self._cache.postcodes:
-                term = self._search_normalized(postcode)
-                if not term:
-                    return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
  
  
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+        # 'postcode' (name plus optional '@variant') is the cache key and the create_postcode_word() argument.
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None  # no usable search term: nothing is created and no postcode is reported
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)", (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+        return postcode_name  # the caller passes this on to _TokenInfo.set_postcode()
  
  
  class _TokenInfo:
  
  
  class _TokenInfo:
@@ -632,6 +646,7 @@ class _TokenInfo:
          self.street_tokens = set()
          self.place_tokens = set()
          self.address_tokens = {}
          self.street_tokens = set()
          self.place_tokens = set()
          self.address_tokens = {}
+        self.postcode = None
  
  
      @staticmethod
  
  
      @staticmethod
@@ -696,6 +711,11 @@ class _TokenInfo:
          if partials:
              self.address_tokens[key] = self._mk_array(partials)
  
          if partials:
              self.address_tokens[key] = self._mk_array(partials)
  
+    def set_postcode(self, postcode):
+        """ Store the given postcode, replacing any value set earlier.
+        """
+        self.postcode = postcode
+
  
  class _TokenCache:
      """ Cache for token information to avoid repeated database queries.
  
  class _TokenCache:
      """ Cache for token information to avoid repeated database queries.