indexer: fetch extra place data asynchronously

[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index d4068aea83410f1b78665badee27247eabaa81c1..b1fd9e9673febce83bdc04cedbf38e0482529db9 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -5,7 +5,9 @@ from collections import OrderedDict
  import logging
  import re
  import shutil
+from textwrap import dedent
  
+from icu import Transliterator
  import psycopg2
  import psycopg2.extras
  
@@ -86,7 +88,7 @@ class LegacyTokenizer:
          self.normalization = None
  
  
-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
          """ Set up a new tokenizer for the database.
  
              This copies all necessary data in the project directory to make
@@ -98,13 +100,16 @@ class LegacyTokenizer:
  
          self.normalization = config.TERM_NORMALIZATION
  
+        self._install_php(config)
+
          with connect(self.dsn) as conn:
              _check_module(module_dir, conn)
              self._save_config(conn, config)
              conn.commit()
  
-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
  
  
      def init_from_project(self):
@@ -127,6 +132,33 @@ class LegacyTokenizer:
                                modulepath=modulepath)
  
  
+    def check_database(self):
+        """ Check that the tokenizer is set up correctly.
+
+            Runs `make_standard_name('a')` on the database as a probe and
+            expects the string 'a' to come back.
+
+            Returns None when the probe succeeds. Otherwise returns a help
+            text which contains the error and a list of things to check.
+        """
+        hint = """\
+             The Postgresql extension nominatim.so was not correctly loaded.
+
+             Error: {error}
+
+             Hints:
+             * Check the output of the CMake/make installation step
+             * Does nominatim.so exist?
+             * Does nominatim.so exist on the database server?
+             * Can nominatim.so be accessed by the database user?
+             """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                try:
+                    out = cur.scalar("SELECT make_standard_name('a')")
+                except psycopg2.Error as err:
+                    # Any database error during the probe is handed back
+                    # to the caller as part of the help text.
+                    return hint.format(error=str(err))
+
+        # The probe ran but did not hand back the input unchanged.
+        if out != 'a':
+            return hint.format(error='Unexpected result for make_standard_name()')
+
+        return None
+
+
      def migrate_database(self, config):
          """ Initialise the project directory of an existing database for
              use with this tokenizer.
@@ -134,6 +166,7 @@ class LegacyTokenizer:
              This is a special migration function for updating existing databases
              to new software versions.
          """
+        self.normalization = config.TERM_NORMALIZATION
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
                                       config.project_dir / 'module')
@@ -158,7 +191,21 @@ class LegacyTokenizer:
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyNameAnalyzer(self.dsn)
+        normalizer = Transliterator.createFromRules("phrase normalizer",
+                                                    self.normalization)
+        return LegacyNameAnalyzer(self.dsn, normalizer)
+
+
+    def _install_php(self, config):
+        """ Write the PHP bootstrap file for the tokenizer.
+
+            Creates `tokenizer.php` in the data directory. The file defines
+            the maximum word frequency and the term normalization rules
+            taken from the configuration and then includes the PHP code of
+            the legacy tokenizer from the configured library directory.
+        """
+        target = self.data_dir / "tokenizer.php"
+
+        # The placeholders are filled in first and the result is dedented
+        # afterwards; this order is kept on purpose.
+        php_code = """\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)
+
+        target.write_text(dedent(php_code))
  
  
      def _init_db_tables(self, config):
@@ -182,7 +229,6 @@ class LegacyTokenizer:
          properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
  
  
-
  class LegacyNameAnalyzer:
      """ The legacy analyzer uses the special Postgresql module for
          splitting names.
@@ -191,9 +237,10 @@ class LegacyNameAnalyzer:
          normalization.
      """
  
-    def __init__(self, dsn):
+    def __init__(self, dsn, normalizer):
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
+        self.normalizer = normalizer
          psycopg2.extras.register_hstore(self.conn)
  
          self._cache = _TokenCache(self.conn)
@@ -215,6 +262,13 @@ class LegacyNameAnalyzer:
              self.conn = None
  
  
+    def normalize(self, phrase):
+        """ Return the normalized form of the given phrase, i.e. the
+            phrase without the properties that are irrelevant for search.
+
+            The work is delegated to the transliterator stored in
+            `self.normalizer`.
+        """
+        normalized = self.normalizer.transliterate(phrase)
+        return normalized
+
+
      def add_postcodes_from_db(self):
          """ Add postcodes from the location_postcode table to the word table.
          """
@@ -224,6 +278,47 @@ class LegacyNameAnalyzer:
                                   FROM location_postcode) x""")
  
  
+    def update_special_phrases(self, phrases):
+        """ Synchronise the special phrases in the word table with the
+            given phrases.
+
+            Each phrase is read by index as (label, class, type, operator).
+            The label is normalized before it is compared with the entries
+            found in the word table. Phrases missing from the table are
+            inserted, entries that are not part of `phrases` are deleted.
+        """
+        wanted = {(self.normalize(p[0]), p[1], p[2], p[3]) for p in phrases}
+
+        with self.conn.cursor() as cur:
+            # Collect what is currently stored. A missing or empty operator
+            # is represented as '-'; the DELETE below maps it back to NULL.
+            cur.execute("""SELECT word, class, type, operator FROM word
+                           WHERE class != 'place'
+                                 OR (type != 'house' AND type != 'postcode')""")
+            stored = {(label, cls, typ, oper or '-')
+                      for label, cls, typ, oper in cur}
+
+            missing = wanted - stored
+            obsolete = stored - wanted
+
+            if missing:
+                # Only 'in' and 'near' are saved as operators, everything
+                # else ends up as NULL.
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ INSERT INTO word (word_id, word_token, word, class, type,
+                                          search_name_count, operator)
+                        (SELECT nextval('seq_word'), make_standard_name(name), name,
+                                class, type, 0,
+                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
+                           FROM (VALUES %s) as v(name, class, type, op))""",
+                    missing)
+
+            if obsolete:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                        WHERE word = name and class = in_class and type = in_type
+                              and ((op = '-' and operator is null) or op = operator)""",
+                    obsolete)
+
+        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+                 len(wanted), len(missing), len(obsolete))
+
+
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """