diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index b553dbc641d708175e8f7281f05cf14cf4673484..171d43926cc42be9695ef11522acd7ecf1ed07c8 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -11,13 +11,12 @@ libICU instead of the PostgreSQL module.
 import itertools
 import json
 import logging
-import re
 from textwrap import dedent
 
 from nominatim.db.connection import connect
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
-from nominatim.indexer.place_info import PlaceInfo
+from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -187,7 +186,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', 10000000);
                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 
 
     def _save_config(self):
@@ -278,8 +277,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 
 
-    @staticmethod
-    def normalize_postcode(postcode):
+    def normalize_postcode(self, postcode):
         """ Convert the postcode to a standardized form.
 
             This function must yield exactly the same result as the SQL function
@@ -292,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
-        to_delete = []
+        analyzer = self.token_analysis.analysis.get('@postcode')
+
         with self.conn.cursor() as cur:
-            # This finds us the rows in location_postcode and word that are
-            # missing in the other table.
-            cur.execute("""SELECT * FROM
-                            (SELECT pc, word FROM
-                              (SELECT distinct(postcode) as pc FROM location_postcode) p
-                              FULL JOIN
-                              (SELECT word FROM word WHERE type = 'P') w
-                              ON pc = word) x
-                           WHERE pc is null or word is null""")
-
-            with CopyBuffer() as copystr:
-                for postcode, word in cur:
-                    if postcode is None:
-                        to_delete.append(word)
-                    else:
-                        copystr.add(self._search_normalized(postcode),
-                                    'P', postcode)
-
-                if to_delete:
-                    cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and word = any(%s)
-                                """, (to_delete, ))
-
-                copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'word'])
+            # First get all postcode names currently in the word table.
+            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
+            word_entries = set((entry[0] for entry in cur))
+
+            # Then compute the required postcode names from the postcode table.
+            needed_entries = set()
+            cur.execute("SELECT country_code, postcode FROM location_postcode")
+            for cc, postcode in cur:
+                info = PlaceInfo({'country_code': cc,
+                                  'class': 'place', 'type': 'postcode',
+                                  'address': {'postcode': postcode}})
+                address = self.sanitizer.process_names(info)[1]
+                for place in address:
+                    if place.kind == 'postcode':
+                        if analyzer is None:
+                            postcode_name = place.name.strip().upper()
+                            variant_base = None
+                        else:
+                            postcode_name = analyzer.normalize(place.name)
+                            variant_base = place.get_attr("variant")
+
+                        if variant_base:
+                            needed_entries.add(f'{postcode_name}@{variant_base}')
+                        else:
+                            needed_entries.add(postcode_name)
+                        break
+
+        # Now update the word table.
+        self._delete_unused_postcode_words(word_entries - needed_entries)
+        self._add_missing_postcode_words(needed_entries - word_entries)
+
+    def _delete_unused_postcode_words(self, tokens):
+        if tokens:
+            with self.conn.cursor() as cur:
+                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
+                            (list(tokens), ))
+
+    def _add_missing_postcode_words(self, tokens):
+        if not tokens:
+            return
+
+        analyzer = self.token_analysis.analysis.get('@postcode')
+        terms = []
+
+        for postcode_name in tokens:
+            if '@' in postcode_name:
+                term, variant = postcode_name.split('@', 2)
+                term = self._search_normalized(term)
+                variants = {term}
+                if analyzer is not None:
+                    variants.update(analyzer.get_variants_ascii(variant))
+                    variants = list(variants)
+            else:
+                variants = [self._search_normalized(postcode_name)]
+            terms.append((postcode_name, variants))
+
+        if terms:
+            with self.conn.cursor() as cur:
+                cur.execute_values("""SELECT create_postcode_word(pc, var)
+                                      FROM (VALUES %s) AS v(pc, var)""",
+                                   terms)
+
+
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -474,7 +511,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         for item in address:
             if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
             elif item.kind == 'housenumber':
                 token_info.add_housenumber(*self._compute_housenumber_token(item))
             elif item.kind == 'street':
@@ -483,7 +520,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 if not item.suffix:
                     token_info.add_place(self._compute_partial_tokens(item.name))
             elif not item.kind.startswith('_') and not item.suffix and \
-                 item.kind not in ('country', 'full'):
+                 item.kind not in ('country', 'full', 'inclusion'):
                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 
 
@@ -606,26 +643,38 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.analysis.get('@postcode')
+
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
 
-            if postcode not in self._cache.postcodes:
-                term = self._search_normalized(postcode)
-                if not term:
-                    return
+        if variant_base:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
 
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None
+
+            variants = {term}
+            if analyzer is not None and variant_base:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+
+        return postcode_name
 
 
 class _TokenInfo:
@@ -638,6 +687,7 @@ class _TokenInfo:
         self.street_tokens = set()
         self.place_tokens = set()
         self.address_tokens = {}
+        self.postcode = None
 
 
     @staticmethod
@@ -666,6 +716,9 @@ class _TokenInfo:
         if self.address_tokens:
             out['addr'] = self.address_tokens
 
+        if self.postcode:
+            out['postcode'] = self.postcode
+
         return out
 
 
@@ -702,6 +755,11 @@ class _TokenInfo:
         if partials:
             self.address_tokens[key] = self._mk_array(partials)
 
+    def set_postcode(self, postcode):
+        """ Set the postcode to the given one.
+        """
+        self.postcode = postcode
+
 
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.