git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge pull request #2786 from lonvia/export-centroid-for-tokenizer
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 31eaaf2958aef1411a8228462fee04d68507555b..319838a16849b7bc9d1bdae31b27dab07594eb5a 100644 (file)
@@ -23,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -38,7 +38,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
 
 
 class ICUTokenizer(AbstractTokenizer):
 
 
 class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
@@ -324,7 +324,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
@@ -359,7 +359,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 if analyzer is None:
                     variants = [term]
                 else:
                 if analyzer is None:
                     variants = [term]
                 else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                     if term not in variants:
                         variants.append(term)
             else:
                     if term not in variants:
                         variants.append(term)
             else:
@@ -431,7 +431,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
@@ -573,17 +573,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                 if result[0] is None:
                 if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                     if variants:
                         with self.conn.cursor() as cur:
                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
+                                        (word_id, list(variants)))
                             result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
                             result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            self._cache.housenumbers[word_id] = result
 
         return result
 
 
         return result
 
@@ -620,7 +620,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
@@ -650,15 +650,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
             if analyzer_id is None:
             if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
             else:
             else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                 if not variants:
                     continue
 
                 if not variants:
                     continue
 
@@ -688,7 +688,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
             variant_base = item.get_attr("variant")
 
         if variant_base:
             variant_base = item.get_attr("variant")
 
         if variant_base:
@@ -703,7 +703,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
             variants = {term}
             if analyzer is not None and variant_base:
 
             variants = {term}
             if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",