]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 319838a16849b7bc9d1bdae31b27dab07594eb5a..799ff559b94599c43e4f66270f82ec94ac0138cc 100644 (file)
@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                                self.loader.make_token_analysis())
 
 
                                self.loader.make_token_analysis())
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
+
+
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
@@ -566,8 +578,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
-                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                        result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
@@ -580,9 +593,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                     variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
                     variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
-                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (word_id, list(variants)))
-                            result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
                             self._cache.housenumbers[word_id] = result
 
         return result
                             self._cache.housenumbers[word_id] = result
 
         return result
@@ -665,8 +678,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
-                    full, part = cast(Tuple[int, List[int]],
-                                      cur.fetchone()) # type: ignore[no-untyped-call]
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
 
                 self._cache.names[token_id] = (full, part)
 
 
                 self._cache.names[token_id] = (full, part)
 
@@ -720,7 +732,7 @@ class _TokenInfo:
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
-        self.street_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
@@ -742,7 +754,7 @@ class _TokenInfo:
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
-        if self.street_tokens:
+        if self.street_tokens is not None:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
@@ -776,6 +788,8 @@ class _TokenInfo:
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
+        if self.street_tokens is None:
+            self.street_tokens = set()
         self.street_tokens.update(tokens)
 
 
         self.street_tokens.update(tokens)