Merge pull request #2786 from lonvia/export-centroid-for-tokenizer

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 1e3eab98a3ff202d65d97e22b690e4315c0bac3b..319838a16849b7bc9d1bdae31b27dab07594eb5a 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -8,7 +8,8 @@
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+                   Dict, Set, Iterable
  import itertools
  import json
  import logging
  import itertools
  import json
  import logging
@@ -22,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.data.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.data.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
@@ -37,7 +38,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  
  
  class ICUTokenizer(AbstractTokenizer):
  
  
  class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """
@@ -323,7 +324,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                              postcode_name = place.name.strip().upper()
                              variant_base = None
                          else:
                              postcode_name = place.name.strip().upper()
                              variant_base = None
                          else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                              variant_base = place.get_attr("variant")
  
                          if variant_base:
                              variant_base = place.get_attr("variant")
  
                          if variant_base:
@@ -358,7 +359,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                  if analyzer is None:
                      variants = [term]
                  else:
                  if analyzer is None:
                      variants = [term]
                  else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                      if term not in variants:
                          variants.append(term)
              else:
                      if term not in variants:
                          variants.append(term)
              else:
@@ -374,7 +375,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
  
  
  
  
  
-    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                 should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
              If `should_replace` is True, then the previous set of will be
                                 should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
              If `should_replace` is True, then the previous set of will be
@@ -430,7 +431,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
      def _remove_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
      def _remove_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
          to_delete = existing_phrases - new_phrases
              new phrase list.
          """
          to_delete = existing_phrases - new_phrases
@@ -572,17 +573,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              # Otherwise use the analyzer to determine the canonical name.
              # Per convention we use the first variant as the 'lookup name', the
              # name that gets saved in the housenumber field of the place.
              # Otherwise use the analyzer to determine the canonical name.
              # Per convention we use the first variant as the 'lookup name', the
              # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                  if result[0] is None:
                  if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                      if variants:
                          with self.conn.cursor() as cur:
                              cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                      if variants:
                          with self.conn.cursor() as cur:
                              cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
+                                        (word_id, list(variants)))
                              result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
                              result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            self._cache.housenumbers[word_id] = result
  
          return result
  
  
          return result
  
@@ -619,7 +620,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
  
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
          """
          assert self.conn is not None
          norm_name = self._search_normalized(name)
          """
          assert self.conn is not None
          norm_name = self._search_normalized(name)
@@ -649,15 +650,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
          for name in names:
              analyzer_id = name.get_attr('analyzer')
              analyzer = self.token_analysis.get_analyzer(analyzer_id)
          for name in names:
              analyzer_id = name.get_attr('analyzer')
              analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
              if analyzer_id is None:
              if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
              else:
              else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                  if not variants:
                      continue
  
                  if not variants:
                      continue
  
@@ -687,7 +688,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              postcode_name = item.name.strip().upper()
              variant_base = None
          else:
              postcode_name = item.name.strip().upper()
              variant_base = None
          else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
              variant_base = item.get_attr("variant")
  
          if variant_base:
              variant_base = item.get_attr("variant")
  
          if variant_base:
@@ -702,7 +703,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
              variants = {term}
              if analyzer is not None and variant_base:
  
              variants = {term}
              if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
  
              with self.conn.cursor() as cur:
                  cur.execute("SELECT create_postcode_word(%s, %s)",
  
              with self.conn.cursor() as cur:
                  cur.execute("SELECT create_postcode_word(%s, %s)",