]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge pull request #2786 from lonvia/export-centroid-for-tokenizer
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 1e3eab98a3ff202d65d97e22b690e4315c0bac3b..319838a16849b7bc9d1bdae31b27dab07594eb5a 100644 (file)
@@ -8,7 +8,8 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+                   Dict, Set, Iterable
 import itertools
 import json
 import logging
 import itertools
 import json
 import logging
@@ -22,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -37,7 +38,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
 
 
 class ICUTokenizer(AbstractTokenizer):
 
 
 class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
@@ -323,7 +324,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
@@ -358,7 +359,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 if analyzer is None:
                     variants = [term]
                 else:
                 if analyzer is None:
                     variants = [term]
                 else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                     if term not in variants:
                         variants.append(term)
             else:
                     if term not in variants:
                         variants.append(term)
             else:
@@ -374,7 +375,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
 
 
 
 
 
-    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
             If `should_replace` is True, then the previous set of will be
                                should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
             If `should_replace` is True, then the previous set of will be
@@ -430,7 +431,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
@@ -572,17 +573,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                 if result[0] is None:
                 if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                     if variants:
                         with self.conn.cursor() as cur:
                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
+                                        (word_id, list(variants)))
                             result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
                             result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            self._cache.housenumbers[word_id] = result
 
         return result
 
 
         return result
 
@@ -619,7 +620,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
@@ -649,15 +650,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
             if analyzer_id is None:
             if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
             else:
             else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                 if not variants:
                     continue
 
                 if not variants:
                     continue
 
@@ -687,7 +688,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
             variant_base = item.get_attr("variant")
 
         if variant_base:
             variant_base = item.get_attr("variant")
 
         if variant_base:
@@ -702,7 +703,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
             variants = {term}
             if analyzer is not None and variant_base:
 
             variants = {term}
             if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",