Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 319838a16849b7bc9d1bdae31b27dab07594eb5a..799ff559b94599c43e4f66270f82ec94ac0138cc 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                                 self.loader.make_token_analysis())
  
  
                                 self.loader.make_token_analysis())
  
  
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return the `num` most frequent full words, cut at their first '@'.
+            NOTE(review): grouping precedes the cut, so repeats are possible.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)  # s[0] is `word`
+
+
      def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
      def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
@@ -566,8 +578,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                  result = self._cache.housenumbers.get(norm_name, result)
                  if result[0] is None:
                      with self.conn.cursor() as cur:
                  result = self._cache.housenumbers.get(norm_name, result)
                  if result[0] is None:
                      with self.conn.cursor() as cur:
-                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                        result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
                          self._cache.housenumbers[norm_name] = result
          else:
              # Otherwise use the analyzer to determine the canonical name.
                          self._cache.housenumbers[norm_name] = result
          else:
              # Otherwise use the analyzer to determine the canonical name.
@@ -580,9 +593,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                      variants = analyzer.compute_variants(word_id)
                      if variants:
                          with self.conn.cursor() as cur:
                      variants = analyzer.compute_variants(word_id)
                      if variants:
                          with self.conn.cursor() as cur:
-                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (word_id, list(variants)))
-                            result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
                              self._cache.housenumbers[word_id] = result
  
          return result
                              self._cache.housenumbers[word_id] = result
  
          return result
@@ -665,8 +678,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                  (token_id, variants))
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                  (token_id, variants))
-                    full, part = cast(Tuple[int, List[int]],
-                                      cur.fetchone()) # type: ignore[no-untyped-call]
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
  
                  self._cache.names[token_id] = (full, part)
  
  
                  self._cache.names[token_id] = (full, part)
  
@@ -720,7 +732,7 @@ class _TokenInfo:
          self.names: Optional[str] = None
          self.housenumbers: Set[str] = set()
          self.housenumber_tokens: Set[int] = set()
          self.names: Optional[str] = None
          self.housenumbers: Set[str] = set()
          self.housenumber_tokens: Set[int] = set()
-        self.street_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
          self.place_tokens: Set[int] = set()
          self.address_tokens: Dict[str, str] = {}
          self.postcode: Optional[str] = None
          self.place_tokens: Set[int] = set()
          self.address_tokens: Dict[str, str] = {}
          self.postcode: Optional[str] = None
@@ -742,7 +754,7 @@ class _TokenInfo:
              out['hnr'] = ';'.join(self.housenumbers)
              out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
  
              out['hnr'] = ';'.join(self.housenumbers)
              out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
  
-        if self.street_tokens:
+        if self.street_tokens is not None:
              out['street'] = self._mk_array(self.street_tokens)
  
          if self.place_tokens:
              out['street'] = self._mk_array(self.street_tokens)
  
          if self.place_tokens:
@@ -776,6 +788,8 @@ class _TokenInfo:
      def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
      def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
+        if self.street_tokens is None:  # nothing stored yet: start a set
+            self.street_tokens = set()
          self.street_tokens.update(tokens)
  
  
          self.street_tokens.update(tokens)