]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge pull request #3167 from lonvia/explicit-encoding
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 1e3eab98a3ff202d65d97e22b690e4315c0bac3b..799ff559b94599c43e4f66270f82ec94ac0138cc 100644 (file)
@@ -8,7 +8,8 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+                   Dict, Set, Iterable
 import itertools
 import json
 import logging
 import itertools
 import json
 import logging
@@ -22,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -37,7 +38,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
 
 
 class ICUTokenizer(AbstractTokenizer):
 
 
 class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
@@ -182,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                                self.loader.make_token_analysis())
 
 
                                self.loader.make_token_analysis())
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
+
+
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
@@ -323,7 +336,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
@@ -358,7 +371,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 if analyzer is None:
                     variants = [term]
                 else:
                 if analyzer is None:
                     variants = [term]
                 else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                     if term not in variants:
                         variants.append(term)
             else:
                     if term not in variants:
                         variants.append(term)
             else:
@@ -374,7 +387,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
 
 
 
 
 
-    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
             If `should_replace` is True, then the previous set of will be
                                should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
             If `should_replace` is True, then the previous set of will be
@@ -430,7 +443,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
@@ -565,24 +578,25 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
-                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                        result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                 if result[0] is None:
                 if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
                     if variants:
                         with self.conn.cursor() as cur:
-                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
-                            result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
+                            self._cache.housenumbers[word_id] = result
 
         return result
 
 
         return result
 
@@ -619,7 +633,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
@@ -649,23 +663,22 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
             if analyzer_id is None:
             if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
             else:
             else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
-                    full, part = cast(Tuple[int, List[int]],
-                                      cur.fetchone()) # type: ignore[no-untyped-call]
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
 
                 self._cache.names[token_id] = (full, part)
 
 
                 self._cache.names[token_id] = (full, part)
 
@@ -687,7 +700,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
             variant_base = item.get_attr("variant")
 
         if variant_base:
             variant_base = item.get_attr("variant")
 
         if variant_base:
@@ -702,7 +715,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
             variants = {term}
             if analyzer is not None and variant_base:
 
             variants = {term}
             if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
@@ -719,7 +732,7 @@ class _TokenInfo:
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
-        self.street_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
@@ -741,7 +754,7 @@ class _TokenInfo:
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
-        if self.street_tokens:
+        if self.street_tokens is not None:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
@@ -775,6 +788,8 @@ class _TokenInfo:
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
+        if self.street_tokens is None:
+            self.street_tokens = set()
         self.street_tokens.update(tokens)
 
 
         self.street_tokens.update(tokens)