reintroduce cutoffs when searching for very frequent words

[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index f52eaadac36eafbd5fa45eea4651c059e89b7d13..93808cc39f3407458bb2d570d2a8740128f2c168 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -106,6 +106,7 @@ class LegacyTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
+        assert config.project_dir is not None
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
                                       config.project_dir / 'module')
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
                                       config.project_dir / 'module')
@@ -127,6 +128,8 @@ class LegacyTokenizer(AbstractTokenizer):
      def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
      def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
+        assert config.project_dir is not None
+
          with connect(self.dsn) as conn:
              self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
  
          with connect(self.dsn) as conn:
              self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
  
@@ -149,6 +152,8 @@ class LegacyTokenizer(AbstractTokenizer):
      def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
      def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
+        assert config.project_dir is not None
+
          with connect(self.dsn) as conn:
              max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
              modulepath = config.DATABASE_MODULE_PATH or \
          with connect(self.dsn) as conn:
              max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
              modulepath = config.DATABASE_MODULE_PATH or \
@@ -193,6 +198,8 @@ class LegacyTokenizer(AbstractTokenizer):
              This is a special migration function for updating existing databases
              to new software versions.
          """
              This is a special migration function for updating existing databases
              to new software versions.
          """
+        assert config.project_dir is not None
+
          self.normalization = config.TERM_NORMALIZATION
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
          self.normalization = config.TERM_NORMALIZATION
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
@@ -203,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
              self._save_config(conn, config)
  
  
              self._save_config(conn, config)
  
  
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
          """ Recompute the frequency of full words.
          """
          with connect(self.dsn) as conn:
          """ Recompute the frequency of full words.
          """
          with connect(self.dsn) as conn:
@@ -249,18 +256,29 @@ class LegacyTokenizer(AbstractTokenizer):
          return LegacyNameAnalyzer(self.dsn, normalizer)
  
  
          return LegacyNameAnalyzer(self.dsn, normalizer)
  
  
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Fetch the full words with the highest search_name_count.
+            At most `num` words are returned, most frequent first.
+        """
+        with conn.cursor() as cursor:
+            cursor.execute(""" SELECT word FROM word WHERE word is not null
+                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return [row[0] for row in cursor]
+
+
      def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
      def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
-        php_file = self.data_dir / "tokenizer.php"
+        if config.lib_dir.php is not None:
+            php_file = self.data_dir / "tokenizer.php"
  
  
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent(f"""\
-                <?php
-                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
-                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
-                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """), encoding='utf-8')
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
+                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
+                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                    """), encoding='utf-8')
  
  
      def _init_db_tables(self, config: Configuration) -> None:
  
  
      def _init_db_tables(self, config: Configuration) -> None:
@@ -544,8 +562,9 @@ class _TokenInfo:
  
          with conn.cursor() as cur:
              cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
  
          with conn.cursor() as cur:
              cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
-            self.data['hnr_tokens'], self.data['hnr'] = \
-                cur.fetchone() # type: ignore[no-untyped-call]
+            result = cur.fetchone()
+            assert result is not None
+            self.data['hnr_tokens'], self.data['hnr'] = result
  
  
      def set_postcode(self, postcode: str) -> None:
  
  
      def set_postcode(self, postcode: str) -> None:
@@ -556,14 +575,13 @@ class _TokenInfo:
      def add_street(self, conn: Connection, street: str) -> None:
          """ Add addr:street match terms.
          """
      def add_street(self, conn: Connection, street: str) -> None:
          """ Add addr:street match terms.
          """
-        def _get_street(name: str) -> List[int]:
+        def _get_street(name: str) -> Optional[str]:
              with conn.cursor() as cur:
              with conn.cursor() as cur:
-                return cast(List[int],
+                return cast(Optional[str],
                              cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
  
          tokens = self.cache.streets.get(street, _get_street)
                              cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
  
          tokens = self.cache.streets.get(street, _get_street)
-        if tokens:
-            self.data['street'] = tokens
+        self.data['street'] = tokens or '{}'
  
  
      def add_place(self, conn: Connection, place: str) -> None:
  
  
      def add_place(self, conn: Connection, place: str) -> None:
@@ -574,8 +592,7 @@ class _TokenInfo:
                  cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
                  cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
-                return cast(Tuple[List[int], List[int]],
-                            cur.fetchone()) # type: ignore[no-untyped-call]
+                return cast(Tuple[List[int], List[int]], cur.fetchone())
  
          self.data['place_search'], self.data['place_match'] = \
              self.cache.places.get(place, _get_place)
  
          self.data['place_search'], self.data['place_match'] = \
              self.cache.places.get(place, _get_place)
@@ -589,8 +606,7 @@ class _TokenInfo:
                  cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
                  cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
-                return cast(Tuple[List[int], List[int]],
-                            cur.fetchone()) # type: ignore[no-untyped-call]
+                return cast(Tuple[List[int], List[int]], cur.fetchone())
  
          tokens = {}
          for key, value in terms:
  
          tokens = {}
          for key, value in terms: