remove redundant 'u' prefixes for unicode strings

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index f8f6af2ea04ad25381c8399b5a6ee48e5d9cdae3..9c7138ce67fa5174d0e947c72bf7a71313fe3435 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -1,3 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
@@ -45,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """
          self.loader = ICURuleLoader(config)
  
          """
          self.loader = ICURuleLoader(config)
  
-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
          self._save_config()
  
          if init_db:
          self._save_config()
  
          if init_db:
@@ -61,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
+        self._install_php(config.lib_dir.php, overwrite=False)
+
  
      def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
  
      def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
@@ -106,6 +114,49 @@ class LegacyICUTokenizer(AbstractTokenizer):
              conn.commit()
  
  
              conn.commit()
  
  
+    def _cleanup_housenumbers(self):
+        """ Remove housenumber tokens from the word table that are no
+            longer in use.
+
+            Only tokens of type 'H' whose word is longer than six characters
+            or not purely numeric are considered. Such a token is deleted
+            when it is not referenced by any name_vector in search_name and
+            its lookup form does not appear among the housenumbers found
+            in placex. Nothing is done when the search_name table does
+            not exist.
+        """
+        with connect(self.dsn) as conn:
+            if not conn.table_exists('search_name'):
+                return
+            # NOTE(review): the cursors below are named, presumably to get
+            # server-side cursors that stream the rows - confirm against the
+            # connection wrapper.
+            # Pass 1: collect removal candidates, i.e. housenumber tokens
+            # without a reference in search_name, keyed by their lookup form
+            # (info->>'lookup', falling back to word_token).
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
+                               FROM word
+                               WHERE type = 'H'
+                                 AND NOT EXISTS(SELECT * FROM search_name
+                                                WHERE ARRAY[word.word_id] && name_vector)
+                                 AND (char_length(coalesce(word, word_token)) > 6
+                                      OR coalesce(word, word_token) not similar to '\\d+')
+                            """)
+                candidates = {token: wid for wid, token in cur}
+            # Pass 2: drop every candidate that still shows up as a
+            # housenumber in placex. The same length/non-numeric filter as
+            # above is applied to the housenumber column. The column value is
+            # split on ';' and each part is checked separately.
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT housenumber FROM placex
+                               WHERE housenumber is not null
+                                     AND (char_length(housenumber) > 6
+                                          OR housenumber not similar to '\\d+')
+                            """)
+                for row in cur:
+                    for hnr in row[0].split(';'):
+                        candidates.pop(hnr, None)
+            LOG.info("There are %s outdated housenumbers.", len(candidates))
+            LOG.debug("Outdated housenumbers: %s", candidates.keys())
+            # Whatever is left was matched by neither pass and gets deleted
+            # by word_id in a single statement.
+            if candidates:
+                with conn.cursor() as cur:
+                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
+                                (list(candidates.values()), ))
+                conn.commit()
+
+
+
+    def update_word_tokens(self):
+        """ Remove unused tokens.
+
+            At the moment this only cleans up housenumber tokens. The start
+            and the end of the run are logged at warning level.
+        """
+        LOG.warning("Cleaning up housenumber tokens.")
+        self._cleanup_housenumbers()
+        LOG.warning("Tokenizer house-keeping done.")
+
+
      def name_analyzer(self):
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
      def name_analyzer(self):
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
@@ -125,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                       self.loader.make_token_analysis())
  
  
                                       self.loader.make_token_analysis())
  
  
-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
          """ Install the php script for the tokenizer.
          """
          php_file = self.data_dir / "tokenizer.php"
          """ Install the php script for the tokenizer.
          """
          php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
  
  
      def _save_config(self):
  
  
      def _save_config(self):
@@ -235,13 +288,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return postcode.strip().upper()
  
  
          return postcode.strip().upper()
  
  
-    def _make_standard_hnr(self, hnr):
-        """ Create a normalised version of a housenumber.
-
-            This function takes minor shortcuts on transliteration.
-        """
-        return self._search_normalized(hnr)
-
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
              table.
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
              table.
@@ -343,17 +389,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
  
      def add_country_names(self, country_code, names):
  
  
      def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
          """
          # Make sure any name preprocessing for country names applies.
          info = PlaceInfo({'name': names, 'country_code': country_code,
                            'rank_address': 4, 'class': 'boundary',
                            'type': 'administrative'})
          self._add_country_full_names(country_code,
          """
          # Make sure any name preprocessing for country names applies.
          info = PlaceInfo({'name': names, 'country_code': country_code,
                            'rank_address': 4, 'class': 'boundary',
                            'type': 'administrative'})
          self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
  
  
  
  
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
          """ Add names for the given country from an already sanitized
              name list.
          """
          """ Add names for the given country from an already sanitized
              name list.
          """
@@ -365,21 +412,41 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
          with self.conn.cursor() as cur:
              # Get existing names
  
          with self.conn.cursor() as cur:
              # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                             FROM word
+                             WHERE type = 'C' and word = %s""",
                          (country_code, ))
                          (country_code, ))
-            word_tokens.difference_update((t[0] for t in cur))
+            existing_tokens = {True: set(), False: set()} # internal/external names
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
+
+            # Delete names that no longer exist.
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
+            if gone_tokens:
+                cur.execute("""DELETE FROM word
+                               USING unnest(%s) as token
+                               WHERE type = 'C' and word = %s
+                                     and word_token = token""",
+                            (list(gone_tokens), country_code))
  
              # Only add those names that are not yet in the list.
  
              # Only add those names that are not yet in the list.
-            if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(word_tokens)))
-
-            # No names are deleted at the moment.
-            # If deletion is made possible, then the static names from the
-            # initial 'country_name' table should be kept.
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                  FROM unnest(%s) as token)
+                           """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                                   (SELECT token, 'C', %s
+                                    FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
  
  
      def process_place(self, place):
  
  
      def process_place(self, place):
@@ -388,14 +455,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
              Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
-        token_info = _TokenInfo(self._cache)
+        token_info = _TokenInfo()
  
          names, address = self.sanitizer.process_names(place)
  
          if names:
  
          names, address = self.sanitizer.process_names(place)
  
          if names:
-            fulls, partials = self._compute_name_tokens(names)
-
-            token_info.add_names(fulls, partials)
+            token_info.set_names(*self._compute_name_tokens(names))
  
              if place.is_country():
                  self._add_country_full_names(place.country_code, names)
  
              if place.is_country():
                  self._add_country_full_names(place.country_code, names)
@@ -403,37 +468,59 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)
  
-        return token_info.data
+        return token_info.to_dict()
  
  
      def _process_place_address(self, token_info, address):
  
  
      def _process_place_address(self, token_info, address):
-        hnrs = []
-        addr_terms = []
-        streets = []
          for item in address:
              if item.kind == 'postcode':
                  self._add_postcode(item.name)
          for item in address:
              if item.kind == 'postcode':
                  self._add_postcode(item.name)
-            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(item.name)
+            elif item.kind == 'housenumber':
+                token_info.add_housenumber(*self._compute_housenumber_token(item))
              elif item.kind == 'street':
              elif item.kind == 'street':
-                token = self._retrieve_full_token(item.name)
-                if token:
-                    streets.append(token)
+                token_info.add_street(self._retrieve_full_tokens(item.name))
              elif item.kind == 'place':
              elif item.kind == 'place':
-                token_info.add_place(self._compute_partial_tokens(item.name))
-            elif not item.kind.startswith('_') and \
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
                   item.kind not in ('country', 'full'):
                   item.kind not in ('country', 'full'):
-                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
+                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
  
  
-        if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
  
  
-        if addr_terms:
-            token_info.add_address_terms(addr_terms)
+    def _compute_housenumber_token(self, hnr):
+        """ Normalize the housenumber and return the word token and the
+            canonical form.
+
+            `hnr` is the address item holding the housenumber; only its
+            `name` attribute is read here. The result is a tuple of
+            (token, canonical form). It is (None, None) when the name
+            normalizes to an empty value or when the housenumber analyzer
+            produces no variants. Successful results are cached in
+            `self._cache.housenumbers` under the normalized name.
+        """
+        analyzer = self.token_analysis.analysis.get('@housenumber')
+        # Default result; also serves as the cache-miss marker below.
+        result = None, None
+
+        if analyzer is None:
+            # When no custom analyzer is set, simply normalize and transliterate
+            norm_name = self._search_normalized(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    # Cache miss: get the token id from the database function.
+                    # The normalized name itself is the canonical form.
+                    with self.conn.cursor() as cur:
+                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+                        result = cur.fetchone()[0], norm_name
+                        self._cache.housenumbers[norm_name] = result
+        else:
+            # Otherwise use the analyzer to determine the canonical name.
+            # Per convention we use the first variant as the 'lookup name', the
+            # name that gets saved in the housenumber field of the place.
+            norm_name = analyzer.normalize(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    variants = analyzer.get_variants_ascii(norm_name)
+                    # Without variants no token is created: the (None, None)
+                    # default is returned and nothing is cached.
+                    if variants:
+                        with self.conn.cursor() as cur:
+                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
+                                        (norm_name, list(variants)))
+                            result = cur.fetchone()[0], variants[0]
+                            self._cache.housenumbers[norm_name] = result
  
  
-        if streets:
-            token_info.add_street(streets)
+        return result
  
  
      def _compute_partial_tokens(self, name):
  
  
      def _compute_partial_tokens(self, name):
@@ -464,25 +551,20 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return tokens
  
  
          return tokens
  
  
-    def _retrieve_full_token(self, name):
+    def _retrieve_full_tokens(self, name):
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
          """
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
          """
-        norm_name = self._normalized(name)
+        norm_name = self._search_normalized(name)
  
          # return cached if possible
          if norm_name in self._cache.fulls:
              return self._cache.fulls[norm_name]
  
  
          # return cached if possible
          if norm_name in self._cache.fulls:
              return self._cache.fulls[norm_name]
  
-        # otherwise compute
-        full, _ = self._cache.names.get(norm_name, (None, None))
-
-        if full is None:
-            with self.conn.cursor() as cur:
-                cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1",
-                            (norm_name, ))
-                if cur.rowcount > 0:
-                    full = cur.fetchone()[0]
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
+                        (norm_name, ))
+            full = [row[0] for row in cur]
  
          self._cache.fulls[norm_name] = full
  
  
          self._cache.fulls[norm_name] = full
  
@@ -498,7 +580,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
          for name in names:
              analyzer_id = name.get_attr('analyzer')
  
          for name in names:
              analyzer_id = name.get_attr('analyzer')
-            norm_name = self._normalized(name.name)
+            analyzer = self.token_analysis.get_analyzer(analyzer_id)
+            norm_name = analyzer.normalize(name.name)
              if analyzer_id is None:
                  token_id = norm_name
              else:
              if analyzer_id is None:
                  token_id = norm_name
              else:
@@ -506,12 +589,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
-                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
+                variants = analyzer.get_variants_ascii(norm_name)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
-                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                  (token_id, variants))
                      full, part = cur.fetchone()
  
                                  (token_id, variants))
                      full, part = cur.fetchone()
  
@@ -545,71 +628,79 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                  self._cache.postcodes.add(postcode)
  
  
                  self._cache.postcodes.add(postcode)
  
  
+class _TokenInfo:
+    """ Collect token information to be sent back to the database.
+    """
+    def __init__(self):
+        self.names = None
+        self.housenumbers = set()
+        self.housenumber_tokens = set()
+        self.street_tokens = set()
+        self.place_tokens = set()
+        self.address_tokens = {}
+
+
      @staticmethod
      @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
+    def _mk_array(tokens):
+        return f"{{{','.join((str(s) for s in tokens))}}}"
  
  
-        return hnrs
  
  
+    def to_dict(self):
+        """ Return the token information in database importable format.
+        """
+        out = {}
  
  
+        if self.names:
+            out['names'] = self.names
  
  
+        if self.housenumbers:
+            out['hnr'] = ';'.join(self.housenumbers)
+            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
  
  
-class _TokenInfo:
-    """ Collect token information to be sent back to the database.
-    """
-    def __init__(self, cache):
-        self._cache = cache
-        self.data = {}
+        if self.street_tokens:
+            out['street'] = self._mk_array(self.street_tokens)
  
  
-    @staticmethod
-    def _mk_array(tokens):
-        return '{%s}' % ','.join((str(s) for s in tokens))
+        if self.place_tokens:
+            out['place'] = self._mk_array(self.place_tokens)
+
+        if self.address_tokens:
+            out['addr'] = self.address_tokens
  
  
+        return out
  
  
-    def add_names(self, fulls, partials):
+
+    def set_names(self, fulls, partials):
          """ Adds token information for the normalised names.
          """
          """ Adds token information for the normalised names.
          """
-        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
+        self.names = self._mk_array(itertools.chain(fulls, partials))
  
  
  
  
-    def add_housenumbers(self, conn, hnrs):
+    def add_housenumber(self, token, hnr):
          """ Extract housenumber information from a list of normalised
              housenumbers.
          """
          """ Extract housenumber information from a list of normalised
              housenumbers.
          """
-        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
-        self.data['hnr'] = ';'.join(hnrs)
+        if token:
+            self.housenumbers.add(hnr)
+            self.housenumber_tokens.add(token)
  
  
      def add_street(self, tokens):
          """ Add addr:street match terms.
          """
  
  
      def add_street(self, tokens):
          """ Add addr:street match terms.
          """
-        self.data['street'] = self._mk_array(tokens)
+        self.street_tokens.update(tokens)
  
  
      def add_place(self, tokens):
          """ Add addr:place search and match terms.
          """
  
  
      def add_place(self, tokens):
          """ Add addr:place search and match terms.
          """
-        if tokens:
-            self.data['place'] = self._mk_array(tokens)
+        self.place_tokens.update(tokens)
  
  
  
  
-    def add_address_terms(self, terms):
+    def add_address_term(self, key, partials):
          """ Add additional address terms.
          """
          """ Add additional address terms.
          """
-        tokens = {key: self._mk_array(partials)
-                  for key, partials in terms if partials}
-
-        if tokens:
-            self.data['addr'] = tokens
+        if partials:
+            self.address_tokens[key] = self._mk_array(partials)
  
  
  class _TokenCache:
  
  
  class _TokenCache:
@@ -624,29 +715,3 @@ class _TokenCache:
          self.fulls = {}
          self.postcodes = set()
          self.housenumbers = {}
          self.fulls = {}
          self.postcodes = set()
          self.housenumbers = {}
-
-
-    def get_hnr_tokens(self, conn, terms):
-        """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary. `terms` is an iterable of normalized
-            housenumbers.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.housenumbers.get(term)
-            if token is None:
-                askdb.append(term)
-            else:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.housenumbers[term] = tid
-                    tokens.append(tid)
-
-        return tokens