Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 31eaaf2958aef1411a8228462fee04d68507555b..79f383f6f1c11b2e8aac46b09fe1afb75e26a981 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -23,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.data.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
@@ -38,7 +38,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  
  
  class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """
@@ -324,7 +324,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                              postcode_name = place.name.strip().upper()
                              variant_base = None
                          else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                              variant_base = place.get_attr("variant")
  
                          if variant_base:
@@ -359,7 +359,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                  if analyzer is None:
                      variants = [term]
                  else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                      if term not in variants:
                          variants.append(term)
              else:
@@ -431,7 +431,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
      def _remove_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
          to_delete = existing_phrases - new_phrases
@@ -566,24 +566,25 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                  result = self._cache.housenumbers.get(norm_name, result)
                  if result[0] is None:
                      with self.conn.cursor() as cur:
-                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                        result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
                          self._cache.housenumbers[norm_name] = result
          else:
              # Otherwise use the analyzer to determine the canonical name.
              # Per convention we use the first variant as the 'lookup name', the
              # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                  if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                      if variants:
                          with self.conn.cursor() as cur:
-                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
-                            result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
+                            self._cache.housenumbers[word_id] = result
  
          return result
  
@@ -620,7 +621,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
          """
          assert self.conn is not None
          norm_name = self._search_normalized(name)
@@ -650,23 +651,22 @@ class ICUNameAnalyzer(AbstractAnalyzer):
          for name in names:
              analyzer_id = name.get_attr('analyzer')
              analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
              if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
              else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                  (token_id, variants))
-                    full, part = cast(Tuple[int, List[int]],
-                                      cur.fetchone()) # type: ignore[no-untyped-call]
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
  
                  self._cache.names[token_id] = (full, part)
  
@@ -688,7 +688,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              postcode_name = item.name.strip().upper()
              variant_base = None
          else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
              variant_base = item.get_attr("variant")
  
          if variant_base:
@@ -703,7 +703,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
              variants = {term}
              if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
  
              with self.conn.cursor() as cur:
                  cur.execute("SELECT create_postcode_word(%s, %s)",